cadi_scraper/
types.rs

1use serde::{Deserialize, Serialize};
2use std::collections::HashMap;
3use std::path::PathBuf;
4
5/// Configuration for the scraper
6#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct ScraperConfig {
8    /// Registry URL for publishing chunks
9    pub registry_url: Option<String>,
10
11    /// Authentication token for registry
12    pub auth_token: Option<String>,
13
14    /// Namespace for published chunks
15    pub namespace: Option<String>,
16
17    /// Chunking strategy to use
18    pub chunking_strategy: ChunkingStrategy,
19
20    /// Maximum chunk size in bytes
21    pub max_chunk_size: usize,
22
23    /// Include overlapping context between chunks
24    pub include_overlap: bool,
25
26    /// Overlap size in characters
27    pub overlap_size: usize,
28
29    /// Language-specific parsing options
30    pub language_options: HashMap<String, LanguageConfig>,
31
32    /// Patterns to exclude from scraping
33    pub exclude_patterns: Vec<String>,
34
35    /// Whether to create hierarchical chunk relationships
36    pub create_hierarchy: bool,
37
38    /// Auto-extract API surfaces and functions
39    pub extract_api_surface: bool,
40
41    /// Whether to auto-detect licenses
42    pub detect_licenses: bool,
43
44    /// HTTP request timeout in seconds
45    pub request_timeout: u64,
46
47    /// Rate limit: requests per second
48    pub rate_limit: f64,
49
50    /// Local cache directory
51    pub cache_dir: Option<PathBuf>,
52}
53
/// Language-specific configuration.
///
/// Instances are stored in [`ScraperConfig::language_options`], keyed by
/// language name. The type is a plain data holder; it has no `Default`
/// impl, so every field must be supplied when constructing or
/// deserializing one.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageConfig {
    /// Minimum chunk size for semantic boundaries.
    ///
    /// NOTE(review): the unit (bytes vs. characters) is not visible in this
    /// file — confirm against the chunker that consumes it.
    pub min_semantic_size: usize,

    /// Whether to split by function/class boundaries.
    pub split_by_semantic_boundary: bool,

    /// Whether to extract function signatures.
    pub extract_functions: bool,

    /// Whether to extract type definitions.
    pub extract_types: bool,

    /// Whether to extract class/struct definitions.
    pub extract_classes: bool,
}
72
73/// Chunking strategy
74#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
75#[derive(Default)]
76pub enum ChunkingStrategy {
77    /// Chunk by individual file
78    #[default]
79    ByFile,
80
81    /// Chunk by semantic boundaries (functions, classes)
82    Semantic,
83
84    /// Chunk by fixed size
85    FixedSize,
86
87    /// Recursive hierarchical chunking
88    Hierarchical,
89
90    /// Chunk by line count
91    ByLineCount,
92}
93
94
95impl Default for ScraperConfig {
96    fn default() -> Self {
97        let mut language_options = HashMap::new();
98        
99        // Default configs for common languages
100        for lang in &["rust", "typescript", "python", "javascript", "go", "c"] {
101            language_options.insert(
102                lang.to_string(),
103                LanguageConfig {
104                    min_semantic_size: 100,
105                    split_by_semantic_boundary: true,
106                    extract_functions: true,
107                    extract_types: true,
108                    extract_classes: true,
109                },
110            );
111        }
112
113        Self {
114            registry_url: None,
115            auth_token: None,
116            namespace: None,
117            chunking_strategy: ChunkingStrategy::ByFile,
118            max_chunk_size: 50 * 1024 * 1024, // 50MB default
119            include_overlap: true,
120            overlap_size: 500,
121            language_options,
122            exclude_patterns: vec![
123                "**/.git".to_string(),
124                "**/node_modules".to_string(),
125                "**/target".to_string(),
126                "**/.venv".to_string(),
127                "**/dist".to_string(),
128                "**/build".to_string(),
129                "**/.DS_Store".to_string(),
130            ],
131            create_hierarchy: true,
132            extract_api_surface: true,
133            detect_licenses: true,
134            request_timeout: 30,
135            rate_limit: 10.0,
136            cache_dir: None,
137        }
138    }
139}
140
/// Input to the scraper: where the content to be scraped comes from.
///
/// No `#[serde(tag = ...)]` attribute is present, so serde's default
/// externally tagged enum representation applies when (de)serializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ScraperInput {
    /// Local file path.
    LocalPath(PathBuf),

    /// HTTP(S) URL.
    Url(String),

    /// Git repository URL, optionally pinned to a branch and/or commit.
    GitRepo {
        /// URL of the repository.
        url: String,
        /// Branch to use; omitted from serialized output when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        branch: Option<String>,
        /// Commit to use; omitted from serialized output when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        commit: Option<String>,
    },

    /// Directory path with optional filters.
    Directory {
        /// Directory to scrape.
        path: PathBuf,
        /// Optional filter patterns; omitted from serialized output when
        /// `None`.
        ///
        /// NOTE(review): presumably glob patterns selecting files to
        /// include — the matching semantics are not visible here; confirm.
        #[serde(skip_serializing_if = "Option::is_none")]
        patterns: Option<Vec<String>>,
    },
}
166
/// Output from the scraper: summary counters plus the produced chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScraperOutput {
    /// Total chunks created.
    pub chunk_count: usize,

    /// Total files processed.
    pub file_count: usize,

    /// Total bytes processed.
    pub total_bytes: u64,

    /// Chunks with their metadata.
    pub chunks: Vec<ScrapedChunk>,

    /// Manifest for all chunks, as free-form JSON; `None` when no manifest
    /// is attached.
    pub manifest: Option<serde_json::Value>,

    /// Errors encountered during scraping, as human-readable strings.
    pub errors: Vec<String>,

    /// Time taken (milliseconds).
    ///
    /// NOTE(review): `u128` matches the return type of
    /// `std::time::Duration::as_millis`; some serde formats have limited
    /// 128-bit integer support — confirm the formats this is written to.
    pub duration_ms: u128,
}
191
/// A single scraped chunk with metadata.
///
/// Parent/child links are expressed as chunk-ID strings rather than
/// references, so the whole value is self-contained and serializable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScrapedChunk {
    /// Chunk ID (content hash).
    ///
    /// NOTE(review): both this field and `content_hash` are described as a
    /// content hash; how they differ is not visible in this file — confirm.
    pub chunk_id: String,

    /// Chunk type (source-cadi, etc).
    pub cadi_type: String,

    /// Human-readable name.
    pub name: String,

    /// Optional description.
    pub description: Option<String>,

    /// Source path or URL.
    pub source: String,

    /// Content hash.
    pub content_hash: String,

    /// File size.
    ///
    /// NOTE(review): presumably in bytes — confirm against the producer.
    pub size: usize,

    /// Language/format detected, if any.
    pub language: Option<String>,

    /// Detected concepts/tags.
    pub concepts: Vec<String>,

    /// Detected dependencies.
    pub dependencies: Vec<String>,

    /// License, if detected.
    pub license: Option<String>,

    /// Parent chunk ID for hierarchical relationships; `None` when the
    /// chunk has no recorded parent.
    pub parent_chunk_id: Option<String>,

    /// Child chunk IDs.
    pub child_chunk_ids: Vec<String>,

    /// Metadata tags.
    pub tags: Vec<String>,

    /// Timestamp when scraped.
    ///
    /// NOTE(review): stored as a string; the format (e.g. RFC 3339) is not
    /// visible in this file — confirm against the code that sets it.
    pub scraped_at: String,
}
239}