use anyhow::{anyhow, Context, Result};
use reqwest::Client;
use scraper::{Html, Selector};

use std::time::Duration;
use tracing::{debug, info, instrument, warn};

use crate::doc_engine::types::{ItemDoc, SearchIndexData, SearchIndexItem, SourceLocation};

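/// Scrapes item documentation and search indexes from rendered docs.rs pages.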
pub struct DocsRsScraper {
    client: Client,
    base_url: String,
    config: ScraperConfig,
}

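/// Tunable network behavior for [`DocsRsScraper`].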
#[derive(Debug, Clone)]
pub struct ScraperConfig {
    pub timeout: Duration,
    pub max_retries: u32,
    pub retry_delay: Duration,
    pub user_agent: String,
    pub head_timeout: Duration,
    pub fetch_timeout: Duration,
}

impl Default for ScraperConfig {
    fn default() -> Self {
        Self {
            timeout: Duration::from_secs(10),
            max_retries: 2,
            retry_delay: Duration::from_millis(500),
            user_agent: "dociium-scraper/1.0".to_string(),
            head_timeout: Duration::from_secs(5),
            fetch_timeout: Duration::from_secs(10),
        }
    }
}

impl DocsRsScraper {
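    /// Creates a scraper with the default [`ScraperConfig`].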
    pub fn new() -> Self {
        Self::with_config(ScraperConfig::default())
    }

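    /// Creates a scraper with an explicit configuration.
    ///
    /// Panics if the underlying HTTP client cannot be constructed.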
    pub fn with_config(config: ScraperConfig) -> Self {
        let client = Client::builder()
            .timeout(config.timeout)
            .user_agent(&config.user_agent)
            .gzip(true)
            .build()
            .expect("Failed to create HTTP client for scraper");

        Self {
            client,
            base_url: "https://docs.rs".to_string(),
            config,
        }
    }

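    /// Fetches and parses the rendered documentation page for a single item.
    ///
    /// A minimal usage sketch; marked `ignore` because it performs live
    /// network requests against docs.rs, and the crate/version used here is
    /// illustrative:
    ///
    /// ```ignore
    /// let scraper = DocsRsScraper::new();
    /// let doc = scraper
    ///     .fetch_item_doc("serde", "latest", "serde::Serialize")
    ///     .await?;
    /// println!("{}", doc.rendered_markdown);
    /// ```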
    #[instrument(skip(self), fields(crate_name = %crate_name, version = %version, item_path = %item_path))]
    pub async fn fetch_item_doc(
        &self,
        crate_name: &str,
        version: &str,
        item_path: &str,
    ) -> Result<ItemDoc> {
        info!(
            "Fetching documentation for {}@{}: {}",
            crate_name, version, item_path
        );

        let url = self
            .discover_item_url(crate_name, version, item_path)
            .await?;
        debug!("Fetching from URL: {}", url);

        let html_content = self.fetch_html(&url).await?;
        let document = Html::parse_document(&html_content);

        let item_doc = self.parse_item_documentation(&document, item_path)?;

        info!("Successfully fetched documentation for {}", item_path);
        Ok(item_doc)
    }

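    /// Downloads and parses `search-index.js` for the given crate version.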
    #[instrument(skip(self), fields(crate_name = %crate_name, version = %version))]
    pub async fn fetch_search_index(
        &self,
        crate_name: &str,
        version: &str,
    ) -> Result<SearchIndexData> {
        info!("Fetching search index for {}@{}", crate_name, version);

        let url = format!(
            "{}/{}/{}/search-index.js",
            self.base_url, crate_name, version
        );
        debug!("Fetching search index from: {}", url);

        let js_content = self.fetch_text(&url).await?;

        let search_data = self.parse_search_index(&js_content, crate_name, version)?;

        info!(
            "Successfully fetched search index with {} items",
            search_data.items.len()
        );
        Ok(search_data)
    }

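    /// Returns `true` if the crate's documentation root answers a HEAD
    /// request with a success status; network failures are treated as
    /// "not available" rather than errors.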
    pub async fn check_docs_available(&self, crate_name: &str, version: &str) -> Result<bool> {
        let url = format!(
            "{}/{}/{}/{}/",
            self.base_url,
            crate_name,
            version,
            crate_name.replace('-', "_")
        );

        match self.client.head(&url).send().await {
            Ok(response) => Ok(response.status().is_success()),
            Err(_) => Ok(false),
        }
    }

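    /// Resolves an item path such as `tokio::sync::Mutex` to a concrete
    /// docs.rs URL by probing kind-prefixed page names (`struct.Mutex.html`,
    /// `fn.Mutex.html`, ...) with HEAD requests until one succeeds.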
    async fn discover_item_url(
        &self,
        crate_name: &str,
        version: &str,
        item_path: &str,
    ) -> Result<String> {
        let path_parts: Vec<&str> = item_path.split("::").collect();

        if path_parts.is_empty() {
            return Err(anyhow!("Empty item path"));
        }

        let crate_name_underscore = crate_name.replace('-', "_");

        // Rust paths use the underscore form of hyphenated crate names, so
        // accept either spelling as a leading crate segment.
        let start_index = if path_parts
            .first()
            .is_some_and(|first| *first == crate_name || *first == crate_name_underscore)
        {
            1
        } else {
            0
        };
        let relevant_parts = &path_parts[start_index..];

        if relevant_parts.is_empty() {
            return Err(anyhow!("No item name found in path"));
        }

        let item_name = relevant_parts.last().unwrap();
        let module_path = if relevant_parts.len() > 1 {
            relevant_parts[..relevant_parts.len() - 1].join("/")
        } else {
            String::new()
        };

        let type_prefixes = [
            "struct", "fn", "trait", "enum", "type", "macro", "constant", "static", "mod", "union",
        ];

        for prefix in &type_prefixes {
            let file_name = format!("{prefix}.{item_name}.html");

            let url = if module_path.is_empty() {
                format!(
                    "{}/{}/{}/{}/{}",
                    self.base_url, crate_name, version, crate_name_underscore, file_name
                )
            } else {
                format!(
                    "{}/{}/{}/{}/{}/{}",
                    self.base_url,
                    crate_name,
                    version,
                    crate_name_underscore,
                    module_path,
                    file_name
                )
            };

            match tokio::time::timeout(self.config.head_timeout, self.client.head(&url).send())
                .await
            {
                Ok(Ok(response)) if response.status().is_success() => {
                    debug!("Found valid URL: {}", url);
                    return Ok(url);
                }
                Ok(Ok(_)) => {
                    debug!("Non-success status for URL: {}", url);
                    continue;
                }
                Ok(Err(e)) => {
                    debug!("Network error for {}: {}", url, e);
                    continue;
                }
                Err(_) => {
                    debug!("Timeout for URL: {}", url);
                    continue;
                }
            }
        }

        // Last resort: try a bare `<name>.html`, which covers pages that are
        // not prefixed with an item kind.
        let url = if module_path.is_empty() {
            format!(
                "{}/{}/{}/{}/{}.html",
                self.base_url, crate_name, version, crate_name_underscore, item_name
            )
        } else {
            format!(
                "{}/{}/{}/{}/{}/{}.html",
                self.base_url, crate_name, version, crate_name_underscore, module_path, item_name
            )
        };

        match tokio::time::timeout(self.config.head_timeout, self.client.head(&url).send()).await {
            Ok(Ok(response)) if response.status().is_success() => Ok(url),
            Ok(Ok(response)) => Err(anyhow!(
                "Non-success status {} for fallback URL: {}",
                response.status(),
                url
            )),
            Ok(Err(e)) => Err(anyhow!("Network error for fallback URL {}: {}", url, e)),
            Err(_) => Err(anyhow!(
                "Timeout checking fallback URL for item: {}",
                item_path
            )),
        }
    }

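    /// Thin alias for [`Self::fetch_text`], kept for call-site readability.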
    async fn fetch_html(&self, url: &str) -> Result<String> {
        self.fetch_text(url).await
    }

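    /// Fetches a URL as text with per-attempt timeouts, linear backoff, and
    /// an early return on 404 (retrying cannot help there).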
    async fn fetch_text(&self, url: &str) -> Result<String> {
        let mut last_error = None;
        // `max_retries` is treated as the total number of attempts, matching
        // the default of 2.
        let max_attempts = self.config.max_retries.max(1);

        for attempt in 1..=max_attempts {
            match tokio::time::timeout(self.config.fetch_timeout, self.client.get(url).send())
                .await
            {
                Ok(Ok(response)) => {
                    if response.status().is_success() {
                        match tokio::time::timeout(self.config.fetch_timeout, response.text())
                            .await
                        {
                            Ok(Ok(content)) => return Ok(content),
                            Ok(Err(e)) => {
                                warn!("Failed to read response body on attempt {}: {}", attempt, e);
                                last_error = Some(anyhow!(e));
                            }
                            Err(_) => {
                                warn!("Timeout reading response body on attempt {}", attempt);
                                last_error = Some(anyhow!("Timeout reading response body"));
                            }
                        }
                    } else if response.status() == reqwest::StatusCode::NOT_FOUND {
                        return Err(anyhow!("Documentation not found: {}", url));
                    } else {
                        last_error = Some(anyhow!("HTTP error: {}", response.status()));
                        warn!("HTTP error on attempt {}: {}", attempt, response.status());
                    }
                }
                Ok(Err(e)) => {
                    warn!("Network error on attempt {}: {}", attempt, e);
                    last_error = Some(anyhow!(e));
                }
                Err(_) => {
                    warn!("Request timeout on attempt {}", attempt);
                    last_error = Some(anyhow!("Request timeout"));
                }
            }

            if attempt < max_attempts {
                // Linear backoff between attempts.
                tokio::time::sleep(self.config.retry_delay * attempt).await;
            }
        }

        Err(last_error.unwrap_or_else(|| anyhow!("Failed to fetch from {}", url)))
    }

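    /// Extracts the docblock, signature, kind, visibility, attributes, and
    /// examples from a rendered rustdoc page.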
    fn parse_item_documentation(&self, document: &Html, item_path: &str) -> Result<ItemDoc> {
        let docblock_selector = Selector::parse("main .docblock").unwrap();
        let signature_selector = Selector::parse(".code-header").unwrap();
        let source_link_selector = Selector::parse(".src-link").unwrap();

        let rendered_markdown = document
            .select(&docblock_selector)
            .next()
            .map(|elem| elem.inner_html())
            .unwrap_or_else(|| "No documentation available.".to_string());

        let signature = document
            .select(&signature_selector)
            .next()
            .map(|elem| elem.text().collect::<Vec<_>>().join(" ").trim().to_string());

        let source_location = document
            .select(&source_link_selector)
            .next()
            .and_then(|elem| elem.value().attr("href"))
            .and_then(|href| self.parse_source_location(href).ok());

        let kind = self.extract_item_kind(document, item_path);
        let visibility = self.extract_visibility(document);
        let attributes = self.extract_attributes(document);
        let examples = self.extract_examples(document);

        Ok(ItemDoc {
            path: item_path.to_string(),
            kind,
            rendered_markdown,
            source_location,
            visibility,
            attributes,
            signature,
            examples,
            see_also: Vec::new(),
        })
    }

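    /// Extracts this crate's entry from a `search-index.js` payload.
    ///
    /// The expected shape (which has varied across rustdoc releases, so this
    /// is a best-effort assumption) is roughly:
    ///
    /// ```text
    /// searchIndex = {"crate_name": {"items": [...], "paths": [...]}};
    /// ```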
    fn parse_search_index(
        &self,
        js_content: &str,
        crate_name: &str,
        version: &str,
    ) -> Result<SearchIndexData> {
        use regex::Regex;
        use serde_json::Value;

        let crate_key_alt = crate_name.replace('-', "_");
        let candidate_keys = [crate_name, crate_key_alt.as_str()];

        let try_parse = |json_str: &str| -> Result<(Value, Value)> {
            let v: Value =
                serde_json::from_str(json_str).context("Failed to parse search index JSON")?;
            for key in candidate_keys {
                if let Some(entry) = v.get(key) {
                    return Ok((v.clone(), entry.clone()));
                }
            }
            Err(anyhow!(
                "Crate data not found in parsed search index object (keys tried: {:?})",
                candidate_keys
            ))
        };

        let regex_patterns = [
            r#"(?s)searchIndex\s*=\s*(\{.*\});"#,
            r#"(?s)var\s+searchIndex\s*=\s*(\{.*\});"#,
            r#"(?s)self\.searchIndex\s*=\s*(\{.*\});"#,
            r#"(?s)window\.searchIndex\s*=\s*(\{.*\});"#,
        ];

        for pat in regex_patterns {
            if let Ok(re) = Regex::new(pat) {
                if let Some(caps) = re.captures(js_content) {
                    let blob = caps.get(1).map(|m| m.as_str()).unwrap_or("");
                    if let Some(json_balanced) = Self::balanced_brace_slice(blob) {
                        if let Ok((json_data, crate_data)) = try_parse(&json_balanced) {
                            return self.build_search_index(
                                crate_name,
                                version,
                                &crate_data,
                                &json_data,
                            );
                        }
                    }
                }
            }
        }

        // Fallback: locate the crate key directly and slice out a balanced
        // object surrounding it.
        let mut fallback_extracted: Option<String> = None;
        for key in candidate_keys {
            if let Some(pos) = js_content.find(&format!("\"{key}\"")) {
                if let Some(start) = js_content[..pos].rfind('{') {
                    if let Some(json_balanced) = Self::balanced_brace_slice(&js_content[start..]) {
                        fallback_extracted = Some(json_balanced);
                        break;
                    }
                }
            }
        }

        if let Some(json_blob) = fallback_extracted {
            if let Ok((json_data, crate_data)) = try_parse(&json_blob) {
                return self.build_search_index(crate_name, version, &crate_data, &json_data);
            }
        }

        Err(anyhow!(
            "Unable to extract or parse search index for crate '{}'",
            crate_name
        ))
    }

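    /// Returns the first balanced `{...}` slice of `input`, or `None` if no
    /// top-level object closes. This is a heuristic: it does not account for
    /// braces inside string literals.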
    fn balanced_brace_slice(input: &str) -> Option<String> {
        let bytes = input.as_bytes();
        let mut depth = 0usize;
        for (i, &b) in bytes.iter().enumerate() {
            if b == b'{' {
                depth += 1;
            } else if b == b'}' {
                if depth == 0 {
                    return None;
                }
                depth -= 1;
                if depth == 0 {
                    // The slice ends on an ASCII byte, so `..=i` is a valid
                    // UTF-8 boundary.
                    return Some(String::from_utf8_lossy(&bytes[..=i]).to_string());
                }
            }
        }
        // Either no brace was seen or the braces never closed.
        None
    }

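    /// Converts a crate's decoded search-index JSON into [`SearchIndexData`].
    /// Item rows are assumed to be arrays of at least
    /// `[kind_id, name, path, description, parent?]`.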
    fn build_search_index(
        &self,
        crate_name: &str,
        version: &str,
        crate_data: &serde_json::Value,
        _root_json: &serde_json::Value,
    ) -> Result<SearchIndexData> {
        let items_array = crate_data
            .get("items")
            .or_else(|| crate_data.get("i"))
            .and_then(|v| v.as_array())
            .ok_or_else(|| anyhow!("Items array not found in crate data"))?;

        let mut items = Vec::new();
        let mut paths = Vec::new();

        for item_value in items_array {
            if let Some(item_array) = item_value.as_array() {
                if item_array.len() >= 4 {
                    let kind = self.kind_id_to_string(item_array[0].as_u64().unwrap_or(0) as usize);
                    let name = item_array[1].as_str().unwrap_or("").to_string();
                    let path = item_array[2].as_str().unwrap_or("").to_string();
                    let description = item_array[3].as_str().unwrap_or("").to_string();
                    let parent_index = item_array
                        .get(4)
                        .and_then(|v| v.as_array())
                        .and_then(|arr| arr.first())
                        .and_then(|v| v.as_u64())
                        .map(|v| v as usize);

                    items.push(SearchIndexItem {
                        name,
                        kind,
                        path,
                        description,
                        parent_index,
                    });
                }
            }
        }

        if let Some(paths_array) = crate_data.get("paths").or_else(|| crate_data.get("p")) {
            if let Some(paths_arr) = paths_array.as_array() {
                for path_value in paths_arr {
                    if let Some(path_str) = path_value.as_str() {
                        paths.push(path_str.to_string());
                    }
                }
            }
        }

        Ok(SearchIndexData {
            crate_name: crate_name.to_string(),
            version: version.to_string(),
            items,
            paths,
        })
    }

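    /// Maps numeric rustdoc item-kind IDs to names. The table appears to
    /// follow the ordering used by older rustdoc search indexes; unknown IDs
    /// are passed through as `unknown_<id>`.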
    fn kind_id_to_string(&self, kind_id: usize) -> String {
        match kind_id {
            0 => "module".to_string(),
            1 => "extern_crate".to_string(),
            2 => "import".to_string(),
            3 => "struct".to_string(),
            4 => "enum".to_string(),
            5 => "function".to_string(),
            6 => "type_def".to_string(),
            7 => "static".to_string(),
            8 => "trait".to_string(),
            9 => "impl".to_string(),
            10 => "tymethod".to_string(),
            11 => "method".to_string(),
            12 => "structfield".to_string(),
            13 => "variant".to_string(),
            14 => "macro".to_string(),
            15 => "primitive".to_string(),
            16 => "assoc_type".to_string(),
            17 => "constant".to_string(),
            18 => "assoc_const".to_string(),
            19 => "union".to_string(),
            20 => "foreign_type".to_string(),
            21 => "keyword".to_string(),
            22 => "existential".to_string(),
            23 => "attr".to_string(),
            24 => "derive".to_string(),
            25 => "trait_alias".to_string(),
            _ => format!("unknown_{kind_id}"),
        }
    }

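    /// Infers the item kind from the page's main heading text.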
    fn extract_item_kind(&self, document: &Html, _item_path: &str) -> String {
        let title_selector = Selector::parse("h1.main-heading").unwrap();

        if let Some(title_elem) = document.select(&title_selector).next() {
            let title_text = title_elem.text().collect::<String>();

            for (marker, kind) in [
                ("Struct", "struct"),
                ("Enum", "enum"),
                ("Trait", "trait"),
                ("Function", "function"),
                ("Module", "module"),
                ("Constant", "constant"),
                ("Type", "type_def"),
                ("Macro", "macro"),
            ] {
                if title_text.contains(marker) {
                    return kind.to_string();
                }
            }
        }

        "unknown".to_string()
    }

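    /// Heuristically classifies visibility from the first code header; only
    /// `pub` items are reported as public.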
    fn extract_visibility(&self, document: &Html) -> String {
        let code_header_selector = Selector::parse(".code-header").unwrap();

        if let Some(header_elem) = document.select(&code_header_selector).next() {
            let header_text = header_elem.text().collect::<String>();

            // Match the `pub` token itself (including `pub(crate)` etc.),
            // not arbitrary substrings of identifiers containing "pub".
            if header_text
                .split_whitespace()
                .any(|tok| tok == "pub" || tok.starts_with("pub("))
            {
                return "public".to_string();
            }
        }

        "private".to_string()
    }

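    /// Detects a few well-known attributes (`derive`, `cfg`, `deprecated`)
    /// in the first code header.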
    fn extract_attributes(&self, document: &Html) -> Vec<String> {
        let mut attributes = Vec::new();

        let code_header_selector = Selector::parse(".code-header").unwrap();

        if let Some(header_elem) = document.select(&code_header_selector).next() {
            let header_text = header_elem.inner_html();

            if header_text.contains("#[derive") {
                attributes.push("derive".to_string());
            }
            if header_text.contains("#[cfg") {
                attributes.push("cfg".to_string());
            }
            if header_text.contains("#[deprecated") {
                attributes.push("deprecated".to_string());
            }
        }

        attributes
    }

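    /// Collects the text of every code block found inside docblocks.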
    fn extract_examples(&self, document: &Html) -> Vec<String> {
        let mut examples = Vec::new();

        let example_selector = Selector::parse(".docblock pre code").unwrap();

        for example_elem in document.select(&example_selector) {
            let example_text = example_elem.text().collect::<String>();
            if !example_text.trim().is_empty() {
                examples.push(example_text);
            }
        }

        examples
    }

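    /// Parses a rustdoc source link (e.g. `/src/serde/lib.rs.html#L123-456`)
    /// into a [`SourceLocation`].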
    fn parse_source_location(&self, href: &str) -> Result<SourceLocation> {
        // Require the "/src/" marker; the previous `unwrap_or(0)` silently
        // mis-sliced hrefs that did not point at a source page.
        let file_start = href
            .find("/src/")
            .ok_or_else(|| anyhow!("No /src/ segment in source link: {}", href))?
            + 5;
        let rest = &href[file_start..];
        let file_end = rest.find(".html").unwrap_or(rest.len());
        let file_path = &rest[..file_end];

        let mut line = 1u32;
        let mut end_line = None;

        // Fragments look like "#L123" or "#L123-456".
        if let Some(fragment_start) = href.find('#') {
            let fragment = &href[fragment_start + 1..];
            if let Some(line_part) = fragment.strip_prefix('L') {
                if let Some(dash_pos) = line_part.find('-') {
                    if let Ok(start) = line_part[..dash_pos].parse::<u32>() {
                        line = start;
                        if let Ok(end) = line_part[dash_pos + 1..].parse::<u32>() {
                            end_line = Some(end);
                        }
                    }
                } else if let Ok(single_line) = line_part.parse::<u32>() {
                    line = single_line;
                }
            }
        }

        Ok(SourceLocation {
            file: file_path.to_string(),
            line,
            column: 1,
            end_line,
            end_column: None,
        })
    }
}

impl Default for DocsRsScraper {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scraper_creation() {
        let _scraper = DocsRsScraper::new();
    }

    #[test]
    fn test_kind_id_conversion() {
        let scraper = DocsRsScraper::new();

        assert_eq!(scraper.kind_id_to_string(3), "struct");
        assert_eq!(scraper.kind_id_to_string(5), "function");
        assert_eq!(scraper.kind_id_to_string(8), "trait");
        assert_eq!(scraper.kind_id_to_string(999), "unknown_999");
    }

    #[test]
    fn test_parse_source_location() {
        let scraper = DocsRsScraper::new();

        let href = "/src/serde/lib.rs.html#L123-456";
        let location = scraper.parse_source_location(href).unwrap();

        assert_eq!(location.file, "serde/lib.rs");
        assert_eq!(location.line, 123);
        assert_eq!(location.end_line, Some(456));
    }

    #[tokio::test]
    #[cfg(feature = "network-tests")]
    async fn test_discover_item_url() {
        let scraper = DocsRsScraper::new();

        match scraper
            .discover_item_url("tokio", "latest", "sync::Mutex")
            .await
        {
            Ok(url) => {
                assert!(url.contains("docs.rs/tokio"));
                assert!(url.contains("sync"));
                assert!(url.contains("Mutex"));
            }
            Err(e) => {
                println!(
                    "URL discovery test failed (expected in some environments): {}",
                    e
                );
            }
        }
    }

    #[tokio::test]
    #[cfg(feature = "network-tests")]
    async fn test_fetch_search_index() {
        let scraper = DocsRsScraper::new();

        match scraper.fetch_search_index("serde", "1.0.0").await {
            Ok(search_data) => {
                assert_eq!(search_data.crate_name, "serde");
                assert!(!search_data.items.is_empty());
            }
            Err(e) => {
                println!(
                    "Search index test failed (expected in some environments): {}",
                    e
                );
            }
        }
    }
}