use anyhow::{anyhow, Context, Result};
use reqwest::Client;
use scraper::{Html, Selector};

use std::time::Duration;
use tracing::{debug, info, instrument, warn};

use crate::doc_engine::types::{ItemDoc, SearchIndexData, SearchIndexItem, SourceLocation};

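/// Scrapes item documentation and search indexes from rendered docs.rs pages.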
pub struct DocsRsScraper {
    client: Client,
    base_url: String,
    config: ScraperConfig,
}

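/// Tunable network behavior for [`DocsRsScraper`].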
#[derive(Debug, Clone)]
pub struct ScraperConfig {
    pub timeout: Duration,
    pub max_retries: u32,
    pub retry_delay: Duration,
    pub user_agent: String,
    pub head_timeout: Duration,
    pub fetch_timeout: Duration,
}

impl Default for ScraperConfig {
    fn default() -> Self {
        Self {
            timeout: Duration::from_secs(10),
            max_retries: 2,
            retry_delay: Duration::from_millis(500),
            user_agent: "dociium-scraper/1.0".to_string(),
            head_timeout: Duration::from_secs(5),
            fetch_timeout: Duration::from_secs(10),
        }
    }
}

impl DocsRsScraper {
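    /// Creates a scraper with the default [`ScraperConfig`].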
    pub fn new() -> Self {
        Self::with_config(ScraperConfig::default())
    }

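    /// Creates a scraper with an explicit configuration.
    ///
    /// Panics if the underlying HTTP client cannot be constructed.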
    pub fn with_config(config: ScraperConfig) -> Self {
        let client = Client::builder()
            .timeout(config.timeout)
            .user_agent(&config.user_agent)
            .gzip(true)
            .build()
            .expect("Failed to create HTTP client for scraper");

        Self {
            client,
            base_url: "https://docs.rs".to_string(),
            config,
        }
    }

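    /// Fetches and parses the rendered documentation page for a single item.
    ///
    /// A minimal usage sketch; marked `ignore` because it performs live
    /// network requests against docs.rs, and the crate/version used here is
    /// illustrative:
    ///
    /// ```ignore
    /// let scraper = DocsRsScraper::new();
    /// let doc = scraper
    ///     .fetch_item_doc("serde", "latest", "serde::Serialize")
    ///     .await?;
    /// println!("{}", doc.rendered_markdown);
    /// ```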
    #[instrument(skip(self), fields(crate_name = %crate_name, version = %version, item_path = %item_path))]
    pub async fn fetch_item_doc(
        &self,
        crate_name: &str,
        version: &str,
        item_path: &str,
    ) -> Result<ItemDoc> {
        info!(
            "Fetching documentation for {}@{}: {}",
            crate_name, version, item_path
        );

        let url = self
            .discover_item_url(crate_name, version, item_path)
            .await?;
        debug!("Fetching from URL: {}", url);

        let html_content = self.fetch_html(&url).await?;
        let document = Html::parse_document(&html_content);

        let item_doc = self.parse_item_documentation(&document, item_path)?;

        info!("Successfully fetched documentation for {}", item_path);
        Ok(item_doc)
    }

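    /// Downloads and parses `search-index.js` for the given crate version.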
    #[instrument(skip(self), fields(crate_name = %crate_name, version = %version))]
    pub async fn fetch_search_index(
        &self,
        crate_name: &str,
        version: &str,
    ) -> Result<SearchIndexData> {
        info!("Fetching search index for {}@{}", crate_name, version);

        let url = format!(
            "{}/{}/{}/search-index.js",
            self.base_url, crate_name, version
        );
        debug!("Fetching search index from: {}", url);

        let js_content = self.fetch_text(&url).await?;

        let search_data = self.parse_search_index(&js_content, crate_name, version)?;

        info!(
            "Successfully fetched search index with {} items",
            search_data.items.len()
        );
        Ok(search_data)
    }

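    /// Returns `true` if the crate's documentation root answers a HEAD
    /// request with a success status; network failures are treated as
    /// "not available" rather than errors.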
    pub async fn check_docs_available(&self, crate_name: &str, version: &str) -> Result<bool> {
        let url = format!(
            "{}/{}/{}/{}/",
            self.base_url,
            crate_name,
            version,
            crate_name.replace('-', "_")
        );

        match self.client.head(&url).send().await {
            Ok(response) => Ok(response.status().is_success()),
            Err(_) => Ok(false),
        }
    }

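    /// Resolves an item path such as `tokio::sync::Mutex` to a concrete
    /// docs.rs URL by probing kind-prefixed page names (`struct.Mutex.html`,
    /// `fn.Mutex.html`, ...) with HEAD requests until one succeeds.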
    async fn discover_item_url(
        &self,
        crate_name: &str,
        version: &str,
        item_path: &str,
    ) -> Result<String> {
        let path_parts: Vec<&str> = item_path.split("::").collect();

        if path_parts.is_empty() {
            return Err(anyhow!("Empty item path"));
        }

        let crate_name_underscore = crate_name.replace('-', "_");

        // Rust paths use the underscore form of hyphenated crate names, so
        // accept either spelling as a leading crate segment.
        let start_index = if path_parts
            .first()
            .is_some_and(|first| *first == crate_name || *first == crate_name_underscore)
        {
            1
        } else {
            0
        };
        let relevant_parts = &path_parts[start_index..];

        if relevant_parts.is_empty() {
            return Err(anyhow!("No item name found in path"));
        }

        let item_name = relevant_parts.last().unwrap();
        let module_path = if relevant_parts.len() > 1 {
            relevant_parts[..relevant_parts.len() - 1].join("/")
        } else {
            String::new()
        };

        let type_prefixes = [
            "struct", "fn", "trait", "enum", "type", "macro", "constant", "static", "mod", "union",
        ];

        for prefix in &type_prefixes {
            let file_name = format!("{prefix}.{item_name}.html");

            let url = if module_path.is_empty() {
                format!(
                    "{}/{}/{}/{}/{}",
                    self.base_url, crate_name, version, crate_name_underscore, file_name
                )
            } else {
                format!(
                    "{}/{}/{}/{}/{}/{}",
                    self.base_url,
                    crate_name,
                    version,
                    crate_name_underscore,
                    module_path,
                    file_name
                )
            };

            match tokio::time::timeout(self.config.head_timeout, self.client.head(&url).send())
                .await
            {
                Ok(Ok(response)) if response.status().is_success() => {
                    debug!("Found valid URL: {}", url);
                    return Ok(url);
                }
                Ok(Ok(_)) => {
                    debug!("Non-success status for URL: {}", url);
                    continue;
                }
                Ok(Err(e)) => {
                    debug!("Network error for {}: {}", url, e);
                    continue;
                }
                Err(_) => {
                    debug!("Timeout for URL: {}", url);
                    continue;
                }
            }
        }

        // Last resort: try a bare `<name>.html`, which covers pages that are
        // not prefixed with an item kind.
        let url = if module_path.is_empty() {
            format!(
                "{}/{}/{}/{}/{}.html",
                self.base_url, crate_name, version, crate_name_underscore, item_name
            )
        } else {
            format!(
                "{}/{}/{}/{}/{}/{}.html",
                self.base_url, crate_name, version, crate_name_underscore, module_path, item_name
            )
        };

        match tokio::time::timeout(self.config.head_timeout, self.client.head(&url).send()).await {
            Ok(Ok(response)) if response.status().is_success() => Ok(url),
            Ok(Ok(response)) => Err(anyhow!(
                "Non-success status {} for fallback URL: {}",
                response.status(),
                url
            )),
            Ok(Err(e)) => Err(anyhow!("Network error for fallback URL {}: {}", url, e)),
            Err(_) => Err(anyhow!(
                "Timeout checking fallback URL for item: {}",
                item_path
            )),
        }
    }

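    /// Thin alias for [`Self::fetch_text`], kept for call-site readability.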
    async fn fetch_html(&self, url: &str) -> Result<String> {
        self.fetch_text(url).await
    }

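    /// Fetches a URL as text with per-attempt timeouts, linear backoff, and
    /// an early return on 404 (retrying cannot help there).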
    async fn fetch_text(&self, url: &str) -> Result<String> {
        let mut last_error = None;
        // `max_retries` is treated as the total number of attempts, matching
        // the default of 2.
        let max_attempts = self.config.max_retries.max(1);

        for attempt in 1..=max_attempts {
            match tokio::time::timeout(self.config.fetch_timeout, self.client.get(url).send())
                .await
            {
                Ok(Ok(response)) => {
                    if response.status().is_success() {
                        match tokio::time::timeout(self.config.fetch_timeout, response.text())
                            .await
                        {
                            Ok(Ok(content)) => return Ok(content),
                            Ok(Err(e)) => {
                                warn!("Failed to read response body on attempt {}: {}", attempt, e);
                                last_error = Some(anyhow!(e));
                            }
                            Err(_) => {
                                warn!("Timeout reading response body on attempt {}", attempt);
                                last_error = Some(anyhow!("Timeout reading response body"));
                            }
                        }
                    } else if response.status() == reqwest::StatusCode::NOT_FOUND {
                        return Err(anyhow!("Documentation not found: {}", url));
                    } else {
                        last_error = Some(anyhow!("HTTP error: {}", response.status()));
                        warn!("HTTP error on attempt {}: {}", attempt, response.status());
                    }
                }
                Ok(Err(e)) => {
                    warn!("Network error on attempt {}: {}", attempt, e);
                    last_error = Some(anyhow!(e));
                }
                Err(_) => {
                    warn!("Request timeout on attempt {}", attempt);
                    last_error = Some(anyhow!("Request timeout"));
                }
            }

            if attempt < max_attempts {
                // Linear backoff between attempts.
                tokio::time::sleep(self.config.retry_delay * attempt).await;
            }
        }

        Err(last_error.unwrap_or_else(|| anyhow!("Failed to fetch from {}", url)))
    }

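    /// Extracts the docblock, signature, kind, visibility, attributes, and
    /// examples from a rendered rustdoc page.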
    fn parse_item_documentation(&self, document: &Html, item_path: &str) -> Result<ItemDoc> {
        let docblock_selector = Selector::parse("main .docblock").unwrap();
        let signature_selector = Selector::parse(".code-header").unwrap();
        let source_link_selector = Selector::parse(".src-link").unwrap();

        let rendered_markdown = document
            .select(&docblock_selector)
            .next()
            .map(|elem| elem.inner_html())
            .unwrap_or_else(|| "No documentation available.".to_string());

        let signature = document
            .select(&signature_selector)
            .next()
            .map(|elem| elem.text().collect::<Vec<_>>().join(" ").trim().to_string());

        let source_location = document
            .select(&source_link_selector)
            .next()
            .and_then(|elem| elem.value().attr("href"))
            .and_then(|href| self.parse_source_location(href).ok());

        let kind = self.extract_item_kind(document, item_path);
        let visibility = self.extract_visibility(document);
        let attributes = self.extract_attributes(document);
        let examples = self.extract_examples(document);

        Ok(ItemDoc {
            path: item_path.to_string(),
            kind,
            rendered_markdown,
            source_location,
            visibility,
            attributes,
            signature,
            examples,
            see_also: Vec::new(),
        })
    }

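    /// Extracts this crate's entry from a `search-index.js` payload.
    ///
    /// The expected shape (which has varied across rustdoc releases, so this
    /// is a best-effort assumption) is roughly:
    ///
    /// ```text
    /// searchIndex = {"crate_name": {"items": [...], "paths": [...]}};
    /// ```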
    fn parse_search_index(
        &self,
        js_content: &str,
        crate_name: &str,
        version: &str,
    ) -> Result<SearchIndexData> {
        use regex::Regex;
        use serde_json::Value;

        let crate_key_alt = crate_name.replace('-', "_");
        let candidate_keys = [crate_name, crate_key_alt.as_str()];

        let try_parse = |json_str: &str| -> Result<(Value, Value)> {
            let v: Value =
                serde_json::from_str(json_str).context("Failed to parse search index JSON")?;
            for key in candidate_keys {
                if let Some(entry) = v.get(key) {
                    return Ok((v.clone(), entry.clone()));
                }
            }
            Err(anyhow!(
                "Crate data not found in parsed search index object (keys tried: {:?})",
                candidate_keys
            ))
        };

        let regex_patterns = [
            r#"(?s)searchIndex\s*=\s*(\{.*\});"#,
            r#"(?s)var\s+searchIndex\s*=\s*(\{.*\});"#,
            r#"(?s)self\.searchIndex\s*=\s*(\{.*\});"#,
            r#"(?s)window\.searchIndex\s*=\s*(\{.*\});"#,
        ];

        for pat in regex_patterns {
            if let Ok(re) = Regex::new(pat) {
                if let Some(caps) = re.captures(js_content) {
                    let blob = caps.get(1).map(|m| m.as_str()).unwrap_or("");
                    if let Some(json_balanced) = Self::balanced_brace_slice(blob) {
                        if let Ok((json_data, crate_data)) = try_parse(&json_balanced) {
                            return self.build_search_index(
                                crate_name,
                                version,
                                &crate_data,
                                &json_data,
                            );
                        }
                    }
                }
            }
        }

        // Fallback: locate the crate key directly and slice out a balanced
        // object surrounding it.
        let mut fallback_extracted: Option<String> = None;
        for key in candidate_keys {
            if let Some(pos) = js_content.find(&format!("\"{key}\"")) {
                if let Some(start) = js_content[..pos].rfind('{') {
                    if let Some(json_balanced) = Self::balanced_brace_slice(&js_content[start..]) {
                        fallback_extracted = Some(json_balanced);
                        break;
                    }
                }
            }
        }

        if let Some(json_blob) = fallback_extracted {
            if let Ok((json_data, crate_data)) = try_parse(&json_blob) {
                return self.build_search_index(crate_name, version, &crate_data, &json_data);
            }
        }

        Err(anyhow!(
            "Unable to extract or parse search index for crate '{}'",
            crate_name
        ))
    }

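    /// Returns the first balanced `{...}` slice of `input`, or `None` if no
    /// top-level object closes. This is a heuristic: it does not account for
    /// braces inside string literals.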
    fn balanced_brace_slice(input: &str) -> Option<String> {
        let bytes = input.as_bytes();
        let mut depth = 0usize;
        for (i, &b) in bytes.iter().enumerate() {
            if b == b'{' {
                depth += 1;
            } else if b == b'}' {
                if depth == 0 {
                    return None;
                }
                depth -= 1;
                if depth == 0 {
                    // The slice ends on an ASCII byte, so `..=i` is a valid
                    // UTF-8 boundary.
                    return Some(String::from_utf8_lossy(&bytes[..=i]).to_string());
                }
            }
        }
        // Either no brace was seen or the braces never closed.
        None
    }

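    /// Converts a crate's decoded search-index JSON into [`SearchIndexData`].
    /// Item rows are assumed to be arrays of at least
    /// `[kind_id, name, path, description, parent?]`.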
    fn build_search_index(
        &self,
        crate_name: &str,
        version: &str,
        crate_data: &serde_json::Value,
        _root_json: &serde_json::Value,
    ) -> Result<SearchIndexData> {
        let items_array = crate_data
            .get("items")
            .or_else(|| crate_data.get("i"))
            .and_then(|v| v.as_array())
            .ok_or_else(|| anyhow!("Items array not found in crate data"))?;

        let mut items = Vec::new();
        let mut paths = Vec::new();

        for item_value in items_array {
            if let Some(item_array) = item_value.as_array() {
                if item_array.len() >= 4 {
                    let kind = self.kind_id_to_string(item_array[0].as_u64().unwrap_or(0) as usize);
                    let name = item_array[1].as_str().unwrap_or("").to_string();
                    let path = item_array[2].as_str().unwrap_or("").to_string();
                    let description = item_array[3].as_str().unwrap_or("").to_string();
                    let parent_index = item_array
                        .get(4)
                        .and_then(|v| v.as_array())
                        .and_then(|arr| arr.first())
                        .and_then(|v| v.as_u64())
                        .map(|v| v as usize);

                    items.push(SearchIndexItem {
                        name,
                        kind,
                        path,
                        description,
                        parent_index,
                    });
                }
            }
        }

        if let Some(paths_array) = crate_data.get("paths").or_else(|| crate_data.get("p")) {
            if let Some(paths_arr) = paths_array.as_array() {
                for path_value in paths_arr {
                    if let Some(path_str) = path_value.as_str() {
                        paths.push(path_str.to_string());
                    }
                }
            }
        }

        Ok(SearchIndexData {
            crate_name: crate_name.to_string(),
            version: version.to_string(),
            items,
            paths,
        })
    }

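    /// Maps numeric rustdoc item-kind IDs to names. The table appears to
    /// follow the ordering used by older rustdoc search indexes; unknown IDs
    /// are passed through as `unknown_<id>`.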
    fn kind_id_to_string(&self, kind_id: usize) -> String {
        match kind_id {
            0 => "module".to_string(),
            1 => "extern_crate".to_string(),
            2 => "import".to_string(),
            3 => "struct".to_string(),
            4 => "enum".to_string(),
            5 => "function".to_string(),
            6 => "type_def".to_string(),
            7 => "static".to_string(),
            8 => "trait".to_string(),
            9 => "impl".to_string(),
            10 => "tymethod".to_string(),
            11 => "method".to_string(),
            12 => "structfield".to_string(),
            13 => "variant".to_string(),
            14 => "macro".to_string(),
            15 => "primitive".to_string(),
            16 => "assoc_type".to_string(),
            17 => "constant".to_string(),
            18 => "assoc_const".to_string(),
            19 => "union".to_string(),
            20 => "foreign_type".to_string(),
            21 => "keyword".to_string(),
            22 => "existential".to_string(),
            23 => "attr".to_string(),
            24 => "derive".to_string(),
            25 => "trait_alias".to_string(),
            _ => format!("unknown_{kind_id}"),
        }
    }

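    /// Infers the item kind from the page's main heading text.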
    fn extract_item_kind(&self, document: &Html, _item_path: &str) -> String {
        let title_selector = Selector::parse("h1.main-heading").unwrap();

        if let Some(title_elem) = document.select(&title_selector).next() {
            let title_text = title_elem.text().collect::<String>();

            for (marker, kind) in [
                ("Struct", "struct"),
                ("Enum", "enum"),
                ("Trait", "trait"),
                ("Function", "function"),
                ("Module", "module"),
                ("Constant", "constant"),
                ("Type", "type_def"),
                ("Macro", "macro"),
            ] {
                if title_text.contains(marker) {
                    return kind.to_string();
                }
            }
        }

        "unknown".to_string()
    }

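    /// Heuristically classifies visibility from the first code header; only
    /// `pub` items are reported as public.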
    fn extract_visibility(&self, document: &Html) -> String {
        let code_header_selector = Selector::parse(".code-header").unwrap();

        if let Some(header_elem) = document.select(&code_header_selector).next() {
            let header_text = header_elem.text().collect::<String>();

            // Match the `pub` token itself (including `pub(crate)` etc.),
            // not arbitrary substrings of identifiers containing "pub".
            if header_text
                .split_whitespace()
                .any(|tok| tok == "pub" || tok.starts_with("pub("))
            {
                return "public".to_string();
            }
        }

        "private".to_string()
    }

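    /// Detects a few well-known attributes (`derive`, `cfg`, `deprecated`)
    /// in the first code header.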
    fn extract_attributes(&self, document: &Html) -> Vec<String> {
        let mut attributes = Vec::new();

        let code_header_selector = Selector::parse(".code-header").unwrap();

        if let Some(header_elem) = document.select(&code_header_selector).next() {
            let header_text = header_elem.inner_html();

            if header_text.contains("#[derive") {
                attributes.push("derive".to_string());
            }
            if header_text.contains("#[cfg") {
                attributes.push("cfg".to_string());
            }
            if header_text.contains("#[deprecated") {
                attributes.push("deprecated".to_string());
            }
        }

        attributes
    }

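    /// Collects the text of every code block found inside docblocks.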
    fn extract_examples(&self, document: &Html) -> Vec<String> {
        let mut examples = Vec::new();

        let example_selector = Selector::parse(".docblock pre code").unwrap();

        for example_elem in document.select(&example_selector) {
            let example_text = example_elem.text().collect::<String>();
            if !example_text.trim().is_empty() {
                examples.push(example_text);
            }
        }

        examples
    }

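    /// Parses a rustdoc source link (e.g. `/src/serde/lib.rs.html#L123-456`)
    /// into a [`SourceLocation`].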
    fn parse_source_location(&self, href: &str) -> Result<SourceLocation> {
        // Require the "/src/" marker; the previous `unwrap_or(0)` silently
        // mis-sliced hrefs that did not point at a source page.
        let file_start = href
            .find("/src/")
            .ok_or_else(|| anyhow!("No /src/ segment in source link: {}", href))?
            + 5;
        let rest = &href[file_start..];
        let file_end = rest.find(".html").unwrap_or(rest.len());
        let file_path = &rest[..file_end];

        let mut line = 1u32;
        let mut end_line = None;

        // Fragments look like "#L123" or "#L123-456".
        if let Some(fragment_start) = href.find('#') {
            let fragment = &href[fragment_start + 1..];
            if let Some(line_part) = fragment.strip_prefix('L') {
                if let Some(dash_pos) = line_part.find('-') {
                    if let Ok(start) = line_part[..dash_pos].parse::<u32>() {
                        line = start;
                        if let Ok(end) = line_part[dash_pos + 1..].parse::<u32>() {
                            end_line = Some(end);
                        }
                    }
                } else if let Ok(single_line) = line_part.parse::<u32>() {
                    line = single_line;
                }
            }
        }

        Ok(SourceLocation {
            file: file_path.to_string(),
            line,
            column: 1,
            end_line,
            end_column: None,
        })
    }
}

impl Default for DocsRsScraper {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scraper_creation() {
        let _scraper = DocsRsScraper::new();
    }

    #[test]
    fn test_kind_id_conversion() {
        let scraper = DocsRsScraper::new();

        assert_eq!(scraper.kind_id_to_string(3), "struct");
        assert_eq!(scraper.kind_id_to_string(5), "function");
        assert_eq!(scraper.kind_id_to_string(8), "trait");
        assert_eq!(scraper.kind_id_to_string(999), "unknown_999");
    }

    #[test]
    fn test_parse_source_location() {
        let scraper = DocsRsScraper::new();

        let href = "/src/serde/lib.rs.html#L123-456";
        let location = scraper.parse_source_location(href).unwrap();

        assert_eq!(location.file, "serde/lib.rs");
        assert_eq!(location.line, 123);
        assert_eq!(location.end_line, Some(456));
    }

    #[tokio::test]
    #[cfg(feature = "network-tests")]
    async fn test_discover_item_url() {
        let scraper = DocsRsScraper::new();

        match scraper
            .discover_item_url("tokio", "latest", "sync::Mutex")
            .await
        {
            Ok(url) => {
                assert!(url.contains("docs.rs/tokio"));
                assert!(url.contains("sync"));
                assert!(url.contains("Mutex"));
            }
            Err(e) => {
                println!(
                    "URL discovery test failed (expected in some environments): {}",
                    e
                );
            }
        }
    }

    #[tokio::test]
    #[cfg(feature = "network-tests")]
    async fn test_fetch_search_index() {
        let scraper = DocsRsScraper::new();

        match scraper.fetch_search_index("serde", "1.0.0").await {
            Ok(search_data) => {
                assert_eq!(search_data.crate_name, "serde");
                assert!(!search_data.items.is_empty());
            }
            Err(e) => {
                println!(
                    "Search index test failed (expected in some environments): {}",
                    e
                );
            }
        }
    }
}