//! `blame.rs` — blame and commit-info support for `searchfox_lib`.
use crate::client::SearchfoxClient;
use crate::types::{BlameInfo, CommitInfo, ParsedCommitInfo};
use crate::utils::searchfox_url_repo;
use anyhow::Result;
use regex::Regex;
use scraper::{Html, Selector};
use std::collections::{HashMap, HashSet};

9impl SearchfoxClient {
10    pub async fn get_head_hash(&self) -> anyhow::Result<String> {
11        let url = format!(
12            "https://searchfox.org/{}/commit-info/HEAD",
13            searchfox_url_repo(&self.repo)
14        );
15        let response = self.get_raw(&url).await?;
16        let json: serde_json::Value = serde_json::from_str(&response)
17            .map_err(|_| anyhow::anyhow!("Failed to parse HEAD commit info"))?;
18        json.as_array()
19            .and_then(|arr| arr.first())
20            .and_then(|commit| commit.get("parent"))
21            .and_then(|p| p.as_str())
22            .map(|s| s.to_string())
23            .ok_or_else(|| anyhow::anyhow!("Could not find HEAD revision hash in commit-info"))
24    }
25
26    /// Fetch blame data for specific lines in a file
27    pub async fn get_blame_for_lines(
28        &self,
29        path: &str,
30        lines: &[usize],
31    ) -> Result<HashMap<usize, BlameInfo>> {
32        // Fetch the HTML page for the file
33        let url = format!("https://searchfox.org/{}/source/{}", self.repo, path);
34        let html = self.get_html(&url).await?;
35
36        // Parse blame data from HTML
37        let blame_map = Self::parse_blame_from_html(&html)?;
38
39        // Filter to only the requested lines
40        let filtered_blame: HashMap<usize, (String, String, usize)> = blame_map
41            .into_iter()
42            .filter(|(line_no, _)| lines.contains(line_no))
43            .collect();
44
45        if filtered_blame.is_empty() {
46            return Ok(HashMap::new());
47        }
48
49        // Collect unique commit hashes
50        let unique_commits: Vec<&str> = {
51            let mut commits: Vec<&str> = filtered_blame
52                .values()
53                .map(|(hash, _, _)| hash.as_str())
54                .collect();
55            commits.sort_unstable();
56            commits.dedup();
57            commits
58        };
59
60        // Batch fetch commit info
61        let commit_infos = self.get_commit_info(&unique_commits).await?;
62
63        // Build commit hash -> CommitInfo map
64        let commit_map: HashMap<String, CommitInfo> = unique_commits
65            .into_iter()
66            .zip(commit_infos.into_iter())
67            .map(|(hash, info)| (hash.to_string(), info))
68            .collect();
69
70        // Build final result
71        let result = filtered_blame
72            .into_iter()
73            .map(|(line_no, (hash, path, orig_line))| {
74                let commit_info = commit_map.get(&hash).cloned();
75                let blame_info = BlameInfo {
76                    commit_hash: hash.clone(),
77                    original_path: path,
78                    original_line: orig_line,
79                    commit_info,
80                };
81                (line_no, blame_info)
82            })
83            .collect();
84
85        Ok(result)
86    }
87
88    /// Fetch commit info for commit hashes (batched to avoid 414 URI Too Long)
89    async fn get_commit_info(&self, revs: &[&str]) -> Result<Vec<CommitInfo>> {
90        if revs.is_empty() {
91            return Ok(Vec::new());
92        }
93
94        // Batch requests to avoid hitting URL length limits (414 errors)
95        // Each hash is 40 chars + 1 comma, so ~50 hashes should be safe
96        const BATCH_SIZE: usize = 50;
97
98        let mut all_infos = Vec::new();
99
100        for chunk in revs.chunks(BATCH_SIZE) {
101            let revs_str = chunk.join(",");
102            let url = format!(
103                "https://searchfox.org/{}/commit-info/{}",
104                self.repo, revs_str
105            );
106
107            let response = self.get_raw(&url).await?;
108            let mut commit_infos: Vec<CommitInfo> = serde_json::from_str(&response)?;
109            all_infos.append(&mut commit_infos);
110        }
111
112        Ok(all_infos)
113    }
114
115    /// Parse blame data from HTML, returns map of line -> (commit_hash, path, original_line)
116    fn parse_blame_from_html(html: &str) -> Result<HashMap<usize, (String, String, usize)>> {
117        let document = Html::parse_document(html);
118        let blame_selector = Selector::parse(".blame-strip").unwrap();
119        let line_selector = Selector::parse("div[role='row']").unwrap();
120
121        let mut result = HashMap::new();
122        let mut line_number = 1;
123
124        // The searchfox HTML structure has rows with role="row"
125        // Each row contains a blame-strip div and code
126        for row in document.select(&line_selector) {
127            // Try to find a blame-strip in this row
128            if let Some(blame_elem) = row.select(&blame_selector).next() {
129                if let Some(blame_data) = blame_elem.value().attr("data-blame") {
130                    if let Some((hash, path, orig_line)) = Self::parse_data_blame(blame_data) {
131                        result.insert(line_number, (hash, path, orig_line));
132                    }
133                }
134            }
135            line_number += 1;
136        }
137
138        log::debug!("Parsed {} blame entries from HTML", result.len());
139        Ok(result)
140    }
141
142    /// Parse data-blame attribute format: "hash#path#lineno"
143    /// % in path means "same as current file"
144    fn parse_data_blame(data: &str) -> Option<(String, String, usize)> {
145        let parts: Vec<&str> = data.split('#').collect();
146        if parts.len() != 3 {
147            return None;
148        }
149
150        let hash = parts[0].to_string();
151        let path = parts[1].to_string();
152        let line_no = parts[2].parse::<usize>().ok()?;
153
154        Some((hash, path, line_no))
155    }
156}
157
158/// Parse commit header HTML to extract structured information
159pub fn parse_commit_header(header: &str) -> ParsedCommitInfo {
160    // Remove HTML tags for parsing
161    let text = strip_html_tags(header);
162
163    // Try to extract bug number
164    let bug_number = extract_bug_number(&text);
165
166    // Split by newline or <br> to separate message from author/date
167    let parts: Vec<&str> = text.split('\n').collect();
168
169    let message = if let Some(first_part) = parts.first() {
170        // Remove "Bug XXXXXX: " prefix if present
171        if let Some(idx) = first_part.find(':') {
172            first_part[idx + 1..].trim().to_string()
173        } else {
174            first_part.trim().to_string()
175        }
176    } else {
177        String::new()
178    };
179
180    // Second part should contain author and date
181    let (author, date) = if parts.len() > 1 {
182        parse_author_date(parts[1])
183    } else {
184        (String::new(), String::new())
185    };
186
187    ParsedCommitInfo {
188        bug_number,
189        message,
190        author,
191        date,
192    }
193}
194
195fn strip_html_tags(html: &str) -> String {
196    let tag_re = Regex::new(r"<[^>]+>").unwrap();
197    let without_tags = tag_re.replace_all(html, "");
198
199    // Decode common HTML entities
200    without_tags
201        .replace("&lt;", "<")
202        .replace("&gt;", ">")
203        .replace("&amp;", "&")
204        .replace("&quot;", "\"")
205        .replace("&#39;", "'")
206}
207
208fn extract_bug_number(text: &str) -> Option<u64> {
209    let bug_re = Regex::new(r"[Bb]ug\s+(\d+)").unwrap();
210    bug_re
211        .captures(text)
212        .and_then(|cap| cap.get(1))
213        .and_then(|m| m.as_str().parse::<u64>().ok())
214}
215
/// Split an "Author, Date" line into its two fields.
///
/// Only the first comma separates the fields, so dates that themselves
/// contain commas (e.g. "May 15, 2021") survive intact. Without a comma the
/// whole text is treated as the author and the date is empty.
fn parse_author_date(text: &str) -> (String, String) {
    match text.split_once(',') {
        Some((author, date)) => (author.trim().to_string(), date.trim().to_string()),
        None => (text.trim().to_string(), String::new()),
    }
}
227
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_data_blame() {
        // "%" in the path field means "same as the current file".
        let (hash, path, line) =
            SearchfoxClient::parse_data_blame("88a286dcec9ba069397bd4c4c35b3e317bf66f4f#%#7")
                .expect("well-formed data-blame should parse");
        assert_eq!(hash, "88a286dcec9ba069397bd4c4c35b3e317bf66f4f");
        assert_eq!(path, "%");
        assert_eq!(line, 7);
    }

    #[test]
    fn test_parse_commit_header() {
        let header =
            "Bug <a href=\"...\">123456</a>: Fix audio issue\n<br><i>John Doe, 2021-05-15</i>";
        let parsed = parse_commit_header(header);
        assert_eq!(parsed.bug_number, Some(123456));
        assert_eq!(parsed.message, "Fix audio issue");
        assert_eq!(parsed.author, "John Doe");
        assert_eq!(parsed.date, "2021-05-15");
    }

    #[test]
    fn test_strip_html_tags() {
        let stripped = strip_html_tags("Bug <a href=\"url\">123</a>: message");
        assert_eq!(stripped, "Bug 123: message");
    }
}