Skip to main content

searchfox_lib/
blame.rs

1use crate::client::SearchfoxClient;
2use crate::types::{BlameInfo, CommitInfo, ParsedCommitInfo};
3use anyhow::Result;
4use regex::Regex;
5use scraper::{Html, Selector};
6use std::collections::HashMap;
7
8impl SearchfoxClient {
9    /// Fetch blame data for specific lines in a file
10    pub async fn get_blame_for_lines(
11        &self,
12        path: &str,
13        lines: &[usize],
14    ) -> Result<HashMap<usize, BlameInfo>> {
15        // Fetch the HTML page for the file
16        let url = format!("https://searchfox.org/{}/source/{}", self.repo, path);
17        let html = self.get_html(&url).await?;
18
19        // Parse blame data from HTML
20        let blame_map = Self::parse_blame_from_html(&html)?;
21
22        // Filter to only the requested lines
23        let filtered_blame: HashMap<usize, (String, String, usize)> = blame_map
24            .into_iter()
25            .filter(|(line_no, _)| lines.contains(line_no))
26            .collect();
27
28        if filtered_blame.is_empty() {
29            return Ok(HashMap::new());
30        }
31
32        // Collect unique commit hashes
33        let unique_commits: Vec<&str> = {
34            let mut commits: Vec<&str> = filtered_blame
35                .values()
36                .map(|(hash, _, _)| hash.as_str())
37                .collect();
38            commits.sort_unstable();
39            commits.dedup();
40            commits
41        };
42
43        // Batch fetch commit info
44        let commit_infos = self.get_commit_info(&unique_commits).await?;
45
46        // Build commit hash -> CommitInfo map
47        let commit_map: HashMap<String, CommitInfo> = unique_commits
48            .into_iter()
49            .zip(commit_infos.into_iter())
50            .map(|(hash, info)| (hash.to_string(), info))
51            .collect();
52
53        // Build final result
54        let result = filtered_blame
55            .into_iter()
56            .map(|(line_no, (hash, path, orig_line))| {
57                let commit_info = commit_map.get(&hash).cloned();
58                let blame_info = BlameInfo {
59                    commit_hash: hash.clone(),
60                    original_path: path,
61                    original_line: orig_line,
62                    commit_info,
63                };
64                (line_no, blame_info)
65            })
66            .collect();
67
68        Ok(result)
69    }
70
71    /// Fetch commit info for commit hashes (batched to avoid 414 URI Too Long)
72    async fn get_commit_info(&self, revs: &[&str]) -> Result<Vec<CommitInfo>> {
73        if revs.is_empty() {
74            return Ok(Vec::new());
75        }
76
77        // Batch requests to avoid hitting URL length limits (414 errors)
78        // Each hash is 40 chars + 1 comma, so ~50 hashes should be safe
79        const BATCH_SIZE: usize = 50;
80
81        let mut all_infos = Vec::new();
82
83        for chunk in revs.chunks(BATCH_SIZE) {
84            let revs_str = chunk.join(",");
85            let url = format!(
86                "https://searchfox.org/{}/commit-info/{}",
87                self.repo, revs_str
88            );
89
90            let response = self.get_raw(&url).await?;
91            let mut commit_infos: Vec<CommitInfo> = serde_json::from_str(&response)?;
92            all_infos.append(&mut commit_infos);
93        }
94
95        Ok(all_infos)
96    }
97
98    /// Parse blame data from HTML, returns map of line -> (commit_hash, path, original_line)
99    fn parse_blame_from_html(html: &str) -> Result<HashMap<usize, (String, String, usize)>> {
100        let document = Html::parse_document(html);
101        let blame_selector = Selector::parse(".blame-strip").unwrap();
102        let line_selector = Selector::parse("div[role='row']").unwrap();
103
104        let mut result = HashMap::new();
105        let mut line_number = 1;
106
107        // The searchfox HTML structure has rows with role="row"
108        // Each row contains a blame-strip div and code
109        for row in document.select(&line_selector) {
110            // Try to find a blame-strip in this row
111            if let Some(blame_elem) = row.select(&blame_selector).next() {
112                if let Some(blame_data) = blame_elem.value().attr("data-blame") {
113                    if let Some((hash, path, orig_line)) = Self::parse_data_blame(blame_data) {
114                        result.insert(line_number, (hash, path, orig_line));
115                    }
116                }
117            }
118            line_number += 1;
119        }
120
121        log::debug!("Parsed {} blame entries from HTML", result.len());
122        Ok(result)
123    }
124
125    /// Parse data-blame attribute format: "hash#path#lineno"
126    /// % in path means "same as current file"
127    fn parse_data_blame(data: &str) -> Option<(String, String, usize)> {
128        let parts: Vec<&str> = data.split('#').collect();
129        if parts.len() != 3 {
130            return None;
131        }
132
133        let hash = parts[0].to_string();
134        let path = parts[1].to_string();
135        let line_no = parts[2].parse::<usize>().ok()?;
136
137        Some((hash, path, line_no))
138    }
139}
140
141/// Parse commit header HTML to extract structured information
142pub fn parse_commit_header(header: &str) -> ParsedCommitInfo {
143    // Remove HTML tags for parsing
144    let text = strip_html_tags(header);
145
146    // Try to extract bug number
147    let bug_number = extract_bug_number(&text);
148
149    // Split by newline or <br> to separate message from author/date
150    let parts: Vec<&str> = text.split('\n').collect();
151
152    let message = if let Some(first_part) = parts.first() {
153        // Remove "Bug XXXXXX: " prefix if present
154        if let Some(idx) = first_part.find(':') {
155            first_part[idx + 1..].trim().to_string()
156        } else {
157            first_part.trim().to_string()
158        }
159    } else {
160        String::new()
161    };
162
163    // Second part should contain author and date
164    let (author, date) = if parts.len() > 1 {
165        parse_author_date(parts[1])
166    } else {
167        (String::new(), String::new())
168    };
169
170    ParsedCommitInfo {
171        bug_number,
172        message,
173        author,
174        date,
175    }
176}
177
178fn strip_html_tags(html: &str) -> String {
179    let tag_re = Regex::new(r"<[^>]+>").unwrap();
180    let without_tags = tag_re.replace_all(html, "");
181
182    // Decode common HTML entities
183    without_tags
184        .replace("&lt;", "<")
185        .replace("&gt;", ">")
186        .replace("&amp;", "&")
187        .replace("&quot;", "\"")
188        .replace("&#39;", "'")
189}
190
191fn extract_bug_number(text: &str) -> Option<u64> {
192    let bug_re = Regex::new(r"[Bb]ug\s+(\d+)").unwrap();
193    bug_re
194        .captures(text)
195        .and_then(|cap| cap.get(1))
196        .and_then(|m| m.as_str().parse::<u64>().ok())
197}
198
/// Split an `Author, Date` string at its first comma.
///
/// Both halves are trimmed; the date keeps any further commas it contains. If
/// there is no comma, the whole trimmed text is returned as the author and the
/// date is empty.
fn parse_author_date(text: &str) -> (String, String) {
    match text.split_once(',') {
        Some((author, date)) => (author.trim().to_owned(), date.trim().to_owned()),
        None => (text.trim().to_owned(), String::new()),
    }
}
210
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed `hash#path#lineno` attribute yields all three fields, with
    /// the `%` path placeholder passed through unchanged.
    #[test]
    fn test_parse_data_blame() {
        let parsed =
            SearchfoxClient::parse_data_blame("88a286dcec9ba069397bd4c4c35b3e317bf66f4f#%#7");

        let expected = (
            "88a286dcec9ba069397bd4c4c35b3e317bf66f4f".to_string(),
            "%".to_string(),
            7,
        );
        assert_eq!(parsed, Some(expected));
    }

    /// A header with a linked bug number, a message and an author/date line is
    /// split into its four components.
    #[test]
    fn test_parse_commit_header() {
        let parsed = parse_commit_header(
            "Bug <a href=\"...\">123456</a>: Fix audio issue\n<br><i>John Doe, 2021-05-15</i>",
        );

        assert_eq!(parsed.bug_number, Some(123456));
        assert_eq!(parsed.message, "Fix audio issue");
        assert_eq!(parsed.author, "John Doe");
        assert_eq!(parsed.date, "2021-05-15");
    }

    /// Tags disappear while the text between and around them is kept.
    #[test]
    fn test_strip_html_tags() {
        let stripped = strip_html_tags("Bug <a href=\"url\">123</a>: message");
        assert_eq!(stripped, "Bug 123: message");
    }
}