lc/search/
jina.rs

1use anyhow::Result;
2use serde::{Deserialize, Serialize};
3use std::collections::HashMap;
4
5use super::{SearchResult, SearchResults};
6
7#[derive(Debug, Serialize, Deserialize)]
8pub struct JinaSearchResult {
9    pub title: String,
10    pub url: String,
11    pub description: String,
12    #[serde(default)]
13    pub content: String,
14}
15
16#[derive(Debug, Serialize, Deserialize)]
17pub struct JinaDirectResponse {
18    pub data: Vec<JinaDirectResult>,
19}
20
21#[derive(Debug, Serialize, Deserialize)]
22pub struct JinaDirectResult {
23    pub title: String,
24    pub url: String,
25    #[serde(default)]
26    pub description: String,
27    #[serde(default)]
28    pub content: String,
29}
30
/// Search provider backed by a Jina search endpoint.
pub struct JinaProvider {
    /// Endpoint the GET request is sent to; the query is added as the `q`
    /// query parameter.
    pub url: String,
    /// Headers attached to every request.
    ///
    /// The value stored under `Authorization` is sent as `Bearer <value>`.
    /// `Accept` (containing `application/json`) and `X-Engine` (`direct`)
    /// additionally decide how the response body is parsed.
    pub headers: HashMap<String, String>,
}
35
36impl JinaProvider {
37    pub fn new(url: String, headers: HashMap<String, String>) -> Self {
38        Self { url, headers }
39    }
40
41    pub async fn search(&self, query: &str, count: Option<usize>) -> Result<SearchResults> {
42        let client = reqwest::Client::new();
43
44        // Build query parameters
45        let params = vec![("q", query.to_string())];
46
47        crate::debug_log!(
48            "Jina: Making GET request to {} with params: {:?}",
49            self.url,
50            params
51        );
52
53        let mut request = client.get(&self.url).query(&params);
54
55        // Check if full content reading is enabled
56        let use_full_content = self.headers.contains_key("X-Engine")
57            && self
58                .headers
59                .get("X-Engine")
60                .map_or(false, |v| v == "direct");
61
62        // Add headers
63        for (name, value) in &self.headers {
64            if name == "Authorization" {
65                // For Authorization header, use Bearer format
66                request = request.header(name, format!("Bearer {}", value));
67                crate::debug_log!("Jina: Added Authorization header with Bearer token");
68            } else {
69                request = request.header(name, value);
70                crate::debug_log!("Jina: Added header {}: {}", name, value);
71            }
72        }
73
74        // Check if we should request JSON format
75        let want_json = self.headers.contains_key("Accept")
76            && self
77                .headers
78                .get("Accept")
79                .map_or(false, |v| v.contains("application/json"));
80
81        if use_full_content {
82            crate::debug_log!("Jina: Using X-Engine: direct for full content reading");
83        }
84
85        if want_json {
86            crate::debug_log!("Jina: Requesting JSON format");
87        } else {
88            crate::debug_log!("Jina: Requesting default text format");
89        }
90
91        let response = request.send().await?;
92
93        let status = response.status();
94        crate::debug_log!("Jina: Received response with status: {}", status);
95
96        if !status.is_success() {
97            let error_text = response.text().await.unwrap_or_default();
98            crate::debug_log!("Jina: Error response: {}", error_text);
99            anyhow::bail!("Jina request failed with status {}: {}", status, error_text);
100        }
101
102        let response_text = response.text().await?;
103        crate::debug_log!("Jina: Response body length: {} bytes", response_text.len());
104
105        let mut results = Vec::new();
106        let max_results = count.unwrap_or(10);
107
108        if want_json {
109            // Parse JSON response
110            crate::debug_log!("Jina: Parsing JSON response");
111
112            if use_full_content {
113                // When using X-Engine: direct, Jina returns a different JSON structure
114                crate::debug_log!("Jina: Parsing direct engine JSON response");
115                let direct_response: JinaDirectResponse = serde_json::from_str(&response_text)
116                    .map_err(|e| {
117                        anyhow::anyhow!("Failed to parse Jina direct JSON response: {}", e)
118                    })?;
119
120                for (index, result) in direct_response.data.iter().enumerate() {
121                    if index >= max_results {
122                        break;
123                    }
124
125                    let search_result = SearchResult {
126                        title: result.title.clone(),
127                        url: result.url.clone(),
128                        snippet: if !result.content.is_empty() {
129                            // With X-Engine: direct, content contains the full page content
130                            result.content.clone()
131                        } else if !result.description.is_empty() {
132                            result.description.clone()
133                        } else {
134                            "No content available".to_string()
135                        },
136                        published_date: None,
137                        author: None,
138                        score: None,
139                    };
140
141                    results.push(search_result);
142                }
143            } else {
144                // Standard JSON response format
145                let jina_results: Vec<JinaSearchResult> = serde_json::from_str(&response_text)
146                    .map_err(|e| anyhow::anyhow!("Failed to parse Jina JSON response: {}", e))?;
147
148                for (index, result) in jina_results.iter().enumerate() {
149                    if index >= max_results {
150                        break;
151                    }
152
153                    let search_result = SearchResult {
154                        title: result.title.clone(),
155                        url: result.url.clone(),
156                        snippet: if !result.description.is_empty() {
157                            result.description.clone()
158                        } else {
159                            result.content.clone()
160                        },
161                        published_date: None,
162                        author: None,
163                        score: None,
164                    };
165
166                    results.push(search_result);
167                }
168            }
169        } else {
170            // Parse text response format
171            crate::debug_log!("Jina: Parsing text response");
172            let lines: Vec<&str> = response_text.lines().collect();
173            let mut current_result: Option<(String, String, String)> = None; // (title, url, description)
174
175            for line in lines {
176                let line = line.trim();
177                if line.is_empty() {
178                    continue;
179                }
180
181                // Parse format like: [1] Title: Title text
182                if let Some(title_match) = line.strip_prefix("[") {
183                    if let Some(end_bracket) = title_match.find("] Title: ") {
184                        let title = title_match[end_bracket + 9..].to_string();
185                        if let Some((prev_title, prev_url, prev_desc)) = current_result.take() {
186                            // Save previous result
187                            if !prev_title.is_empty()
188                                && !prev_url.is_empty()
189                                && results.len() < max_results
190                            {
191                                results.push(SearchResult {
192                                    title: prev_title,
193                                    url: prev_url,
194                                    snippet: prev_desc,
195                                    published_date: None,
196                                    author: None,
197                                    score: None,
198                                });
199                            }
200                        }
201                        current_result = Some((title, String::new(), String::new()));
202                        continue;
203                    }
204                }
205
206                // Parse format like: [1] URL Source: https://example.com
207                if let Some(url_match) = line.strip_prefix("[") {
208                    if let Some(end_bracket) = url_match.find("] URL Source: ") {
209                        let url = url_match[end_bracket + 13..].to_string();
210                        if let Some((title, _, desc)) = current_result.take() {
211                            current_result = Some((title, url, desc));
212                        }
213                        continue;
214                    }
215                }
216
217                // Parse format like: [1] Description: Description text
218                if let Some(desc_match) = line.strip_prefix("[") {
219                    if let Some(end_bracket) = desc_match.find("] Description: ") {
220                        let description = desc_match[end_bracket + 15..].to_string();
221                        if let Some((title, url, _)) = current_result.take() {
222                            current_result = Some((title, url, description));
223                        }
224                        continue;
225                    }
226                }
227            }
228
229            // Don't forget the last result
230            if let Some((title, url, desc)) = current_result {
231                if !title.is_empty() && !url.is_empty() && results.len() < max_results {
232                    results.push(SearchResult {
233                        title,
234                        url,
235                        snippet: desc,
236                        published_date: None,
237                        author: None,
238                        score: None,
239                    });
240                }
241            }
242        }
243
244        crate::debug_log!("Jina: Successfully extracted {} results", results.len());
245
246        Ok(SearchResults {
247            query: query.to_string(),
248            provider: "Jina".to_string(),
249            results,
250            total_results: None,  // Jina doesn't provide total count
251            search_time_ms: None, // Jina doesn't provide timing info
252        })
253    }
254}
255
256/// Search function that matches the interface used by other providers
257pub async fn search(
258    provider_config: &super::SearchProviderConfig,
259    query: &str,
260    count: Option<usize>,
261) -> anyhow::Result<super::SearchResults> {
262    let provider = JinaProvider::new(provider_config.url.clone(), provider_config.headers.clone());
263
264    provider.search(query, count).await
265}