1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::path::PathBuf;
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct ScraperConfig {
8 pub registry_url: Option<String>,
10
11 pub auth_token: Option<String>,
13
14 pub namespace: Option<String>,
16
17 pub chunking_strategy: ChunkingStrategy,
19
20 pub max_chunk_size: usize,
22
23 pub include_overlap: bool,
25
26 pub overlap_size: usize,
28
29 pub language_options: HashMap<String, LanguageConfig>,
31
32 pub exclude_patterns: Vec<String>,
34
35 pub create_hierarchy: bool,
37
38 pub extract_api_surface: bool,
40
41 pub detect_licenses: bool,
43
44 pub request_timeout: u64,
46
47 pub rate_limit: f64,
49
50 pub cache_dir: Option<PathBuf>,
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
56pub struct LanguageConfig {
57 pub min_semantic_size: usize,
59
60 pub split_by_semantic_boundary: bool,
62
63 pub extract_functions: bool,
65
66 pub extract_types: bool,
68
69 pub extract_classes: bool,
71}
72
73#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
75pub enum ChunkingStrategy {
76 ByFile,
78
79 Semantic,
81
82 FixedSize,
84
85 Hierarchical,
87
88 ByLineCount,
90}
91
92impl Default for ChunkingStrategy {
93 fn default() -> Self {
94 ChunkingStrategy::ByFile
95 }
96}
97
98impl Default for ScraperConfig {
99 fn default() -> Self {
100 let mut language_options = HashMap::new();
101
102 for lang in &["rust", "typescript", "python", "javascript", "go", "c"] {
104 language_options.insert(
105 lang.to_string(),
106 LanguageConfig {
107 min_semantic_size: 100,
108 split_by_semantic_boundary: true,
109 extract_functions: true,
110 extract_types: true,
111 extract_classes: true,
112 },
113 );
114 }
115
116 Self {
117 registry_url: None,
118 auth_token: None,
119 namespace: None,
120 chunking_strategy: ChunkingStrategy::ByFile,
121 max_chunk_size: 50 * 1024 * 1024, include_overlap: true,
123 overlap_size: 500,
124 language_options,
125 exclude_patterns: vec![
126 "**/.git".to_string(),
127 "**/node_modules".to_string(),
128 "**/target".to_string(),
129 "**/.venv".to_string(),
130 "**/dist".to_string(),
131 "**/build".to_string(),
132 "**/.DS_Store".to_string(),
133 ],
134 create_hierarchy: true,
135 extract_api_surface: true,
136 detect_licenses: true,
137 request_timeout: 30,
138 rate_limit: 10.0,
139 cache_dir: None,
140 }
141 }
142}
143
144#[derive(Debug, Clone, Serialize, Deserialize)]
146pub enum ScraperInput {
147 LocalPath(PathBuf),
149
150 Url(String),
152
153 GitRepo {
155 url: String,
156 #[serde(skip_serializing_if = "Option::is_none")]
157 branch: Option<String>,
158 #[serde(skip_serializing_if = "Option::is_none")]
159 commit: Option<String>,
160 },
161
162 Directory {
164 path: PathBuf,
165 #[serde(skip_serializing_if = "Option::is_none")]
166 patterns: Option<Vec<String>>,
167 },
168}
169
170#[derive(Debug, Clone, Serialize, Deserialize)]
172pub struct ScraperOutput {
173 pub chunk_count: usize,
175
176 pub file_count: usize,
178
179 pub total_bytes: u64,
181
182 pub chunks: Vec<ScrapedChunk>,
184
185 pub manifest: Option<serde_json::Value>,
187
188 pub errors: Vec<String>,
190
191 pub duration_ms: u128,
193}
194
195#[derive(Debug, Clone, Serialize, Deserialize)]
197pub struct ScrapedChunk {
198 pub chunk_id: String,
200
201 pub cadi_type: String,
203
204 pub name: String,
206
207 pub description: Option<String>,
209
210 pub source: String,
212
213 pub content_hash: String,
215
216 pub size: usize,
218
219 pub language: Option<String>,
221
222 pub concepts: Vec<String>,
224
225 pub dependencies: Vec<String>,
227
228 pub license: Option<String>,
230
231 pub parent_chunk_id: Option<String>,
233
234 pub child_chunk_ids: Vec<String>,
236
237 pub tags: Vec<String>,
239
240 pub scraped_at: String,
242}