datafold/ingestion/
json_processor.rs

1//! JSON conversion and processing for file uploads
2
3use file_to_json::{Converter, FallbackStrategy, OpenRouterConfig};
4use serde_json::{json, Value};
5use std::io::Write;
6use std::path::PathBuf;
7use std::time::Duration;
8use tempfile::NamedTempFile;
9
10use crate::ingestion::config::AIProvider;
11use crate::ingestion::IngestionError;
12use crate::log_feature;
13use crate::logging::features::LogFeature;
14
15/// Convert a file to JSON using file_to_json library (core implementation)
16async fn convert_file_to_json_core(file_path: &PathBuf) -> Result<Value, IngestionError> {
17    log_feature!(
18        LogFeature::Ingestion,
19        info,
20        "Converting file to JSON: {:?}",
21        file_path
22    );
23
24    // Load fold_db ingestion config
25    let ingestion_config = crate::ingestion::IngestionConfig::from_env()?;
26
27    // Only OpenRouter is supported for file_to_json conversion
28    if ingestion_config.provider != AIProvider::OpenRouter {
29        return Err(IngestionError::configuration_error(
30            "File conversion requires OpenRouter provider. Ollama is not supported for this feature."
31        ));
32    }
33
34    // Build file_to_json OpenRouterConfig from fold_db config
35    let file_to_json_config = OpenRouterConfig {
36        api_key: ingestion_config.openrouter.api_key.clone(),
37        model: ingestion_config.openrouter.model.clone(),
38        timeout: Duration::from_secs(ingestion_config.timeout_seconds),
39        fallback_strategy: FallbackStrategy::Chunked,
40        vision_model: Some(ingestion_config.openrouter.model.clone()),
41        max_image_bytes: 5 * 1024 * 1024, // 5MB default
42    };
43
44    let file_path_str = file_path.to_string_lossy().to_string();
45    
46    // Run conversion in blocking task
47    tokio::task::spawn_blocking(move || {
48        let converter = Converter::new(file_to_json_config)
49            .map_err(|_| IngestionError::FileConversionFailed)?;
50        converter.convert_path(&file_path_str)
51            .map_err(|e| {
52                log_feature!(
53                    LogFeature::Ingestion,
54                    error,
55                    "Failed to convert file to JSON: {}",
56                    e
57                );
58                IngestionError::FileConversionFailed
59            })
60    })
61    .await
62    .map_err(|e| {
63        log_feature!(
64            LogFeature::Ingestion,
65            error,
66            "Failed to spawn blocking task: {}",
67            e
68        );
69        IngestionError::FileConversionFailed
70    })?
71}
72
73/// Convert a file to JSON using file_to_json library (public API for ingestion)
74pub async fn convert_file_to_json(file_path: &PathBuf) -> Result<Value, IngestionError> {
75    convert_file_to_json_core(file_path).await
76}
77
78/// Convert a file to JSON using file_to_json library (actix-web wrapper)
79pub async fn convert_file_to_json_http(
80    file_path: &PathBuf,
81) -> Result<Value, actix_web::HttpResponse> {
82    use actix_web::HttpResponse;
83
84    match convert_file_to_json_core(file_path).await {
85        Ok(value) => Ok(value),
86        Err(e) => {
87            log_feature!(LogFeature::Ingestion, error, "File conversion failed: {}", e);
88            Err(HttpResponse::InternalServerError().json(json!({
89                "success": false,
90                "error": format!("Failed to convert file to JSON: {}", e)
91            })))
92        }
93    }
94}
95
96/// Flatten JSON structures with unnecessary root layers
97/// Handles patterns:
98/// 1. root -> array: {"key": [...]} => [...]
99/// 2. root -> root -> array: {"key1": {"key2": [...]}} => [...]
100/// 3. array elements with single-field wrappers: [{"wrapper": {...}}] => [{...}]
101/// 4. direct arrays with single-field wrappers: [...] => [...]
102pub fn flatten_root_layers(json: Value) -> Value {
103    // Check if it's already an array - flatten its elements
104    if json.is_array() {
105        log_feature!(
106            LogFeature::Ingestion,
107            info,
108            "Flattening array elements with single-field wrappers"
109        );
110        return flatten_array_elements(json);
111    }
112    
113    // Check for root -> array pattern
114    if let Value::Object(ref map) = json {
115        // If object has exactly one field
116        if map.len() == 1 {
117            let (key, value) = map.iter().next().unwrap();
118            
119            // If that field is an array, flatten the array and its elements
120            if value.is_array() {
121                log_feature!(
122                    LogFeature::Ingestion,
123                    info,
124                    "Flattening root->array pattern: removing '{}' wrapper",
125                    key
126                );
127                return flatten_array_elements(value.clone());
128            }
129            
130            // Check for root -> root -> array pattern
131            if let Value::Object(ref inner_map) = value {
132                if inner_map.len() == 1 {
133                    let (inner_key, inner_value) = inner_map.iter().next().unwrap();
134                    if inner_value.is_array() {
135                        log_feature!(
136                            LogFeature::Ingestion,
137                            info,
138                            "Flattening root->root->array pattern: removing '{}'->'{}' wrappers",
139                            key,
140                            inner_key
141                        );
142                        return flatten_array_elements(inner_value.clone());
143                    }
144                }
145            }
146        }
147    }
148    
149    // No flattening needed
150    json
151}
152
153/// Flatten array elements that have unnecessary single-field wrapper objects
154fn flatten_array_elements(value: Value) -> Value {
155    if let Value::Array(arr) = value {
156        let flattened_elements: Vec<Value> = arr
157            .into_iter()
158            .map(|element| {
159                // If element is an object with exactly one field
160                if let Value::Object(ref map) = element {
161                    if map.len() == 1 {
162                        let (key, inner_value) = map.iter().next().unwrap();
163                        
164                        // If that field contains an object (not an array or primitive),
165                        // flatten by returning the inner object
166                        if inner_value.is_object() {
167                            log_feature!(
168                                LogFeature::Ingestion,
169                                debug,
170                                "Flattening array element: removing '{}' wrapper from object",
171                                key
172                            );
173                            return inner_value.clone();
174                        }
175                    }
176                }
177                element
178            })
179            .collect();
180        
181        Value::Array(flattened_elements)
182    } else {
183        value
184    }
185}
186
187/// Add file_location metadata to JSON value
188pub fn add_file_location(json: Value, file_path: &std::path::Path) -> Value {
189    match json {
190        Value::Object(mut map) => {
191            // Add file_location directly to the object
192            map.insert(
193                "file_location".to_string(),
194                Value::String(file_path.to_string_lossy().to_string()),
195            );
196            Value::Object(map)
197        }
198        Value::Array(arr) => {
199            // Add file_location to each element in the array
200            let modified_array: Vec<Value> = arr
201                .into_iter()
202                .map(|mut item| {
203                    if let Value::Object(ref mut obj) = item {
204                        obj.insert(
205                            "file_location".to_string(),
206                            Value::String(file_path.to_string_lossy().to_string()),
207                        );
208                    }
209                    item
210                })
211                .collect();
212            Value::Array(modified_array)
213        }
214        other => {
215            // For primitives, wrap in a minimal object with file_location
216            json!({
217                "file_location": file_path.to_string_lossy().to_string(),
218                "value": other
219            })
220        }
221    }
222}
223
224/// Save JSON to a temporary file that persists for testing
225/// Returns the path to the temporary file
226pub fn save_json_to_temp_file(json: &Value) -> std::io::Result<String> {
227    // Create temp directory in system temp location (works in Lambda and locally)
228    let temp_dir = std::env::temp_dir().join("folddb_debug");
229    std::fs::create_dir_all(&temp_dir)?;
230    
231    // Create a named temporary file with .json extension
232    let temp_file = NamedTempFile::new_in(&temp_dir)?;
233    
234    // Write the JSON with pretty formatting
235    let json_string = serde_json::to_string_pretty(json)?;
236    
237    // Get a mutable handle to write
238    let mut file = temp_file.as_file();
239    file.write_all(json_string.as_bytes())?;
240    file.sync_all()?;
241    
242    // Persist the temp file so it doesn't get deleted when dropped
243    let (_file, path) = temp_file.keep()?;
244    
245    Ok(path.to_string_lossy().to_string())
246}
247
/// Unit tests for the JSON flattening and file-location helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// A single-field root object holding an array is replaced by the array.
    #[test]
    fn test_flatten_root_to_array() {
        let input = json!({
            "data": [
                {"id": 1, "name": "Alice"},
                {"id": 2, "name": "Bob"}
            ]
        });

        let result = flatten_root_layers(input);

        assert!(result.is_array());
        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 2);
        assert_eq!(arr[0]["id"], 1);
    }

    /// Two nested single-field wrappers around an array are both removed.
    #[test]
    fn test_flatten_root_root_to_array() {
        let input = json!({
            "response": {
                "items": [
                    {"id": 1, "name": "Alice"},
                    {"id": 2, "name": "Bob"}
                ]
            }
        });

        let result = flatten_root_layers(input);

        assert!(result.is_array());
        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 2);
        assert_eq!(arr[0]["name"], "Alice");
    }

    /// A root object with more than one field is not a wrapper.
    #[test]
    fn test_no_flatten_multiple_fields() {
        let input = json!({
            "data": [{"id": 1}],
            "metadata": {"count": 1}
        });

        let result = flatten_root_layers(input.clone());

        // Should remain unchanged
        assert_eq!(result, input);
    }

    /// A single-field root whose value is a multi-field object stays as it is.
    #[test]
    fn test_no_flatten_nested_object() {
        let input = json!({
            "user": {
                "id": 1,
                "name": "Alice"
            }
        });

        let result = flatten_root_layers(input.clone());

        // Should remain unchanged
        assert_eq!(result, input);
    }

    /// A direct array of multi-field objects has nothing to unwrap.
    #[test]
    fn test_no_flatten_direct_array() {
        let input = json!([
            {"id": 1, "name": "Alice"},
            {"id": 2, "name": "Bob"}
        ]);

        let result = flatten_root_layers(input.clone());

        // Should remain unchanged
        assert_eq!(result, input);
    }

    /// Arrays buried three levels deep are out of reach of the flattening.
    #[test]
    fn test_no_flatten_deep_nesting() {
        let input = json!({
            "level1": {
                "level2": {
                    "level3": [{"id": 1}]
                }
            }
        });

        let result = flatten_root_layers(input.clone());

        // Should remain unchanged (we only flatten up to 2 levels)
        assert_eq!(result, input);
    }

    /// Primitive input is returned untouched.
    #[test]
    fn test_no_flatten_primitive_input() {
        let input = json!("just a string");

        let result = flatten_root_layers(input.clone());

        assert_eq!(result, input);
    }

    /// The flattened result is the bare array, not an object around it.
    #[test]
    fn test_flatten_with_array_keeps_array_structure() {
        let input = json!({
            "data": [
                {"id": 1, "name": "Alice"},
                {"id": 2, "name": "Bob"}
            ]
        });

        let result = flatten_root_layers(input);

        // Verify it's an array, not wrapped in an object
        assert!(result.is_array(), "Result should be an array");
        assert!(!result.is_object(), "Result should not be wrapped in an object");

        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 2);
    }

    /// Objects receive the location key alongside their existing fields.
    #[test]
    fn test_add_file_location_to_object() {
        let input = json!({"id": 1, "name": "Alice"});
        let path = PathBuf::from("/test/file.csv");

        let result = add_file_location(input, &path);

        assert!(result.is_object());
        let obj = result.as_object().unwrap();
        assert_eq!(obj["file_location"], "/test/file.csv");
        assert_eq!(obj["id"], 1);
    }

    /// An existing `file_location` entry is replaced by the supplied path.
    #[test]
    fn test_add_file_location_overwrites_existing_key() {
        let input = json!({"id": 1, "file_location": "/old/path.csv"});
        let path = PathBuf::from("/test/file.csv");

        let result = add_file_location(input, &path);

        assert_eq!(
            result,
            json!({"id": 1, "file_location": "/test/file.csv"})
        );
    }

    /// Every object element of an array receives the location key.
    #[test]
    fn test_add_file_location_to_array() {
        let input = json!([
            {"id": 1, "name": "Alice"},
            {"id": 2, "name": "Bob"}
        ]);
        let path = PathBuf::from("/test/file.csv");

        let result = add_file_location(input, &path);

        assert!(result.is_array());
        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 2);
        assert_eq!(arr[0]["file_location"], "/test/file.csv");
        assert_eq!(arr[1]["file_location"], "/test/file.csv");
    }

    /// Array elements that are not objects are passed through unchanged.
    #[test]
    fn test_add_file_location_skips_non_object_array_elements() {
        let input = json!([{"id": 1}, 7, "text", [1, 2]]);
        let path = PathBuf::from("/test/file.csv");

        let result = add_file_location(input, &path);

        assert_eq!(
            result,
            json!([
                {"id": 1, "file_location": "/test/file.csv"},
                7,
                "text",
                [1, 2]
            ])
        );
    }

    /// Primitives are wrapped in an object carrying the location and the value.
    #[test]
    fn test_add_file_location_wraps_primitive() {
        let path = PathBuf::from("/test/file.csv");

        let result = add_file_location(json!(42), &path);

        assert_eq!(
            result,
            json!({"file_location": "/test/file.csv", "value": 42})
        );
    }

    /// Single-field object wrappers around array elements are removed.
    #[test]
    fn test_flatten_array_elements_with_single_field_wrappers() {
        let input = json!({
            "data": [
                {"item": {"id": 1, "name": "Alice"}},
                {"item": {"id": 2, "name": "Bob"}}
            ]
        });

        let result = flatten_root_layers(input);

        assert!(result.is_array());
        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 2);

        // Each array element should be flattened (no "item" wrapper)
        assert_eq!(arr[0]["id"], 1);
        assert_eq!(arr[0]["name"], "Alice");
        assert!(arr[0].get("item").is_none());

        assert_eq!(arr[1]["id"], 2);
        assert_eq!(arr[1]["name"], "Bob");
        assert!(arr[1].get("item").is_none());
    }

    /// Elements with several fields keep their nested objects.
    #[test]
    fn test_flatten_array_elements_preserves_multi_field_objects() {
        let input = json!({
            "data": [
                {
                    "id": 1,
                    "wrapper": {"name": "Alice"}
                },
                {
                    "id": 2,
                    "wrapper": {"name": "Bob"}
                }
            ]
        });

        let result = flatten_root_layers(input);

        // Should flatten root but NOT array elements (they have multiple fields)
        assert!(result.is_array());
        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 2);
        assert_eq!(arr[0]["id"], 1);
        assert!(arr[0].get("wrapper").is_some());
    }

    /// Single-field elements holding primitives are not unwrapped.
    #[test]
    fn test_flatten_array_elements_preserves_primitives() {
        let input = json!({
            "data": [
                {"value": "Alice"},
                {"value": 42},
                {"value": true}
            ]
        });

        let result = flatten_root_layers(input);

        assert!(result.is_array());
        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 3);

        // Should NOT flatten when the inner value is a primitive
        assert_eq!(arr[0]["value"], "Alice");
        assert_eq!(arr[1]["value"], 42);
        assert_eq!(arr[2]["value"], true);
    }

    /// Single-field elements holding an array are not unwrapped either.
    #[test]
    fn test_flatten_array_elements_preserves_array_values() {
        let input = json!([
            {"tags": ["a", "b"]},
            {"tags": []}
        ]);

        let result = flatten_root_layers(input.clone());

        assert_eq!(result, input);
    }

    /// Root wrappers and element wrappers are removed in the same pass.
    #[test]
    fn test_flatten_complex_nested_structure() {
        let input = json!({
            "response": {
                "items": [
                    {"record": {"id": 1, "name": "Alice", "email": "alice@example.com"}},
                    {"record": {"id": 2, "name": "Bob", "email": "bob@example.com"}}
                ]
            }
        });

        let result = flatten_root_layers(input);

        assert!(result.is_array());
        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 2);

        // Should flatten both root layers AND array element wrappers
        assert_eq!(arr[0]["id"], 1);
        assert_eq!(arr[0]["name"], "Alice");
        assert!(arr[0].get("record").is_none());

        assert_eq!(arr[1]["id"], 2);
        assert_eq!(arr[1]["name"], "Bob");
        assert!(arr[1].get("record").is_none());
    }

    /// Arrays passed in directly also get their element wrappers removed.
    #[test]
    fn test_flatten_direct_array_with_single_field_wrappers() {
        // Test case for arrays returned directly by file_to_json
        let input = json!([
            {"tweet": {"id": 1, "text": "Hello", "user": "alice"}},
            {"tweet": {"id": 2, "text": "World", "user": "bob"}}
        ]);

        let result = flatten_root_layers(input);

        assert!(result.is_array());
        let arr = result.as_array().unwrap();
        assert_eq!(arr.len(), 2);

        // Should flatten the "tweet" wrapper from each element
        assert_eq!(arr[0]["id"], 1);
        assert_eq!(arr[0]["text"], "Hello");
        assert_eq!(arr[0]["user"], "alice");
        assert!(arr[0].get("tweet").is_none());

        assert_eq!(arr[1]["id"], 2);
        assert_eq!(arr[1]["text"], "World");
        assert_eq!(arr[1]["user"], "bob");
        assert!(arr[1].get("tweet").is_none());
    }
}
519