sql_cli/web/
http_fetcher.rs

1// HTTP fetcher for WEB CTEs (now supports file:// URLs too)
2use anyhow::{Context, Result};
3use regex::Regex;
4use std::fs::File;
5use std::io::Cursor;
6use std::path::Path;
7use std::time::Duration;
8use tracing::{debug, info};
9
10use crate::data::datatable::DataTable;
11use crate::data::stream_loader::{load_csv_from_reader, load_json_from_reader};
12use crate::sql::parser::ast::{DataFormat, WebCTESpec};
13
/// Fetches data from a URL and converts it to a DataTable.
///
/// Holds one blocking `reqwest` client that is reused for every HTTP(S)
/// request issued through this fetcher.
pub struct WebDataFetcher {
    /// Blocking HTTP client used to issue GET requests.
    client: reqwest::blocking::Client,
}
18
19impl WebDataFetcher {
20    pub fn new() -> Result<Self> {
21        let client = reqwest::blocking::Client::builder()
22            .timeout(Duration::from_secs(30))
23            .user_agent("sql-cli/1.0")
24            .build()?;
25
26        Ok(Self { client })
27    }
28
29    /// Fetch data from a WEB CTE specification (supports http://, https://, and file:// URLs)
30    pub fn fetch(&self, spec: &WebCTESpec, table_name: &str) -> Result<DataTable> {
31        info!("Fetching data from URL: {}", spec.url);
32
33        // Check if this is a file:// URL
34        if spec.url.starts_with("file://") {
35            return self.fetch_file(spec, table_name);
36        }
37
38        // Regular HTTP/HTTPS handling
39        // Build request
40        let mut request = self.client.get(&spec.url);
41
42        // Add headers if provided
43        for (key, value) in &spec.headers {
44            let resolved_value = self.resolve_env_var(value)?;
45            request = request.header(key, resolved_value);
46        }
47
48        // Execute request
49        let response = request
50            .send()
51            .with_context(|| format!("Failed to fetch from URL: {}", spec.url))?;
52
53        // Check status
54        if !response.status().is_success() {
55            return Err(anyhow::anyhow!(
56                "HTTP request failed with status {}: {}",
57                response.status(),
58                spec.url
59            ));
60        }
61
62        // Get content type for format detection
63        let content_type = response
64            .headers()
65            .get("content-type")
66            .and_then(|v| v.to_str().ok())
67            .unwrap_or("")
68            .to_string();
69
70        debug!("Response content-type: {}", content_type);
71
72        // Read response body
73        let bytes = response.bytes()?;
74
75        // Determine format
76        let format = match &spec.format {
77            Some(fmt) => fmt.clone(),
78            None => self.detect_format(&spec.url, &content_type),
79        };
80
81        info!("Using format: {:?} for {}", format, spec.url);
82
83        // Parse based on format
84        self.parse_data(bytes.to_vec(), format, table_name, "web", &spec.url)
85    }
86
87    /// Fetch data from a file:// URL
88    fn fetch_file(&self, spec: &WebCTESpec, table_name: &str) -> Result<DataTable> {
89        // Extract path from file:// URL
90        let file_path = if spec.url.starts_with("file://") {
91            &spec.url[7..] // Remove "file://" prefix
92        } else {
93            &spec.url
94        };
95
96        info!("Reading local file: {}", file_path);
97
98        // Check if file exists
99        let path = Path::new(file_path);
100        if !path.exists() {
101            return Err(anyhow::anyhow!("File not found: {}", file_path));
102        }
103
104        // Open file
105        let file =
106            File::open(path).with_context(|| format!("Failed to open file: {}", file_path))?;
107
108        // Get file size for memory checks
109        let metadata = file.metadata()?;
110        let file_size = metadata.len();
111        debug!("File size: {} bytes", file_size);
112
113        // Determine format from extension or spec
114        let format = match &spec.format {
115            Some(fmt) => fmt.clone(),
116            None => self.detect_format(file_path, ""),
117        };
118
119        info!("Using format: {:?} for {}", format, file_path);
120
121        // Parse based on format
122        match format {
123            DataFormat::CSV => load_csv_from_reader(file, table_name, "file", file_path)
124                .with_context(|| format!("Failed to parse CSV from {}", file_path)),
125            DataFormat::JSON => load_json_from_reader(file, table_name, "file", file_path)
126                .with_context(|| format!("Failed to parse JSON from {}", file_path)),
127            DataFormat::Auto => {
128                // For files, we can't retry with the same reader, so determine based on extension
129                if file_path.ends_with(".json") {
130                    let file = File::open(path)?;
131                    load_json_from_reader(file, table_name, "file", file_path)
132                        .with_context(|| format!("Failed to parse JSON from {}", file_path))
133                } else {
134                    // Default to CSV for auto-detect with files
135                    let file = File::open(path)?;
136                    load_csv_from_reader(file, table_name, "file", file_path)
137                        .with_context(|| format!("Failed to parse CSV from {}", file_path))
138                }
139            }
140        }
141    }
142
143    /// Parse data bytes based on format
144    fn parse_data(
145        &self,
146        bytes: Vec<u8>,
147        format: DataFormat,
148        table_name: &str,
149        source_type: &str,
150        source_path: &str,
151    ) -> Result<DataTable> {
152        match format {
153            DataFormat::CSV => {
154                let reader = Cursor::new(bytes);
155                load_csv_from_reader(reader, table_name, source_type, source_path)
156                    .with_context(|| format!("Failed to parse CSV from {}", source_path))
157            }
158            DataFormat::JSON => {
159                let reader = Cursor::new(bytes);
160                load_json_from_reader(reader, table_name, source_type, source_path)
161                    .with_context(|| format!("Failed to parse JSON from {}", source_path))
162            }
163            DataFormat::Auto => {
164                // Try CSV first, then JSON
165                let reader_csv = Cursor::new(bytes.clone());
166                match load_csv_from_reader(reader_csv, table_name, source_type, source_path) {
167                    Ok(table) => Ok(table),
168                    Err(_) => {
169                        debug!("CSV parsing failed, trying JSON");
170                        let reader_json = Cursor::new(bytes);
171                        load_json_from_reader(reader_json, table_name, source_type, source_path)
172                            .with_context(|| format!("Failed to parse data from {}", source_path))
173                    }
174                }
175            }
176        }
177    }
178
179    /// Detect format from URL extension or content type
180    fn detect_format(&self, url: &str, content_type: &str) -> DataFormat {
181        // Check content type first
182        if content_type.contains("json") {
183            return DataFormat::JSON;
184        }
185        if content_type.contains("csv") || content_type.contains("text/plain") {
186            return DataFormat::CSV;
187        }
188
189        // Check URL extension
190        if url.ends_with(".json") {
191            DataFormat::JSON
192        } else if url.ends_with(".csv") {
193            DataFormat::CSV
194        } else {
195            // Default to auto-detect
196            DataFormat::Auto
197        }
198    }
199
200    /// Resolve environment variables in values (${VAR_NAME} or $VAR_NAME syntax)
201    fn resolve_env_var(&self, value: &str) -> Result<String> {
202        let mut result = value.to_string();
203
204        // Handle ${VAR} syntax - can be embedded in strings
205        // Use lazy_static for better performance, but for now just compile inline
206        let re = Regex::new(r"\$\{([^}]+)\}").unwrap();
207        for cap in re.captures_iter(value) {
208            let var_name = &cap[1];
209            match std::env::var(var_name) {
210                Ok(var_value) => {
211                    result = result.replace(&cap[0], &var_value);
212                }
213                Err(_) => {
214                    // For security, don't expose which env vars exist
215                    // Just log a debug message and keep the placeholder
216                    debug!(
217                        "Environment variable {} not found, keeping placeholder",
218                        var_name
219                    );
220                }
221            }
222        }
223
224        // Also handle simple $VAR syntax at the start of the string
225        if result.starts_with('$') && !result.starts_with("${") {
226            let var_name = &result[1..];
227            if let Ok(var_value) = std::env::var(var_name) {
228                return Ok(var_value);
229            }
230        }
231
232        Ok(result)
233    }
234}
235
#[cfg(test)]
mod tests {
    use super::*;

    /// Covers URL-extension detection, content-type detection and the
    /// auto-detect fallback of `detect_format`.
    #[test]
    fn test_detect_format() {
        let fetcher = WebDataFetcher::new().unwrap();

        // Map each variant to a label so a failing assertion shows what was detected.
        let detected = |url: &str, content_type: &str| -> &'static str {
            match fetcher.detect_format(url, content_type) {
                DataFormat::CSV => "CSV",
                DataFormat::JSON => "JSON",
                DataFormat::Auto => "Auto",
            }
        };

        // URL-based detection
        assert_eq!(detected("http://example.com/data.csv", ""), "CSV");
        assert_eq!(detected("http://example.com/data.json", ""), "JSON");

        // Content-type detection
        assert_eq!(
            detected("http://example.com/data", "application/json"),
            "JSON"
        );
        assert_eq!(detected("http://example.com/data", "text/csv"), "CSV");

        // Auto-detect fallback
        assert_eq!(detected("http://example.com/data", ""), "Auto");
    }
}