git_x/
large_files.rs

1use crate::core::output::BufferedOutput;
2use crate::{GitXError, Result};
3use std::collections::HashMap;
4use std::process::Command;
5
/// A file path together with its size, stored both in bytes and in megabytes.
#[derive(Debug, Clone)]
pub struct FileInfo {
    /// Path of the file.
    pub path: String,
    /// Size in bytes.
    pub size_bytes: u64,
    /// Size in units of 1024 * 1024 bytes; derived from `size_bytes` by [`FileInfo::new`].
    pub size_mb: f64,
}

impl FileInfo {
    /// Creates a `FileInfo`, computing `size_mb` from `size_bytes`.
    pub fn new(path: String, size_bytes: u64) -> Self {
        const BYTES_PER_MB: f64 = 1024.0 * 1024.0;

        Self {
            size_mb: size_bytes as f64 / BYTES_PER_MB,
            path,
            size_bytes,
        }
    }
}
23
24pub fn run(limit: usize, threshold: Option<f64>) -> Result<()> {
25    let mut output = BufferedOutput::new();
26
27    output.add_line("šŸ” Scanning repository for large files...".to_string());
28
29    // Get all file objects and their sizes
30    let file_objects = get_file_objects().map_err(|e| GitXError::GitCommand(e.to_string()))?;
31
32    if file_objects.is_empty() {
33        output.add_line("ā„¹ļø No files found in repository history".to_string());
34        output.flush();
35        return Ok(());
36    }
37
38    // Find the largest files by path
39    let mut large_files = find_largest_files(file_objects, threshold);
40
41    // Sort by size (largest first)
42    large_files.sort_by(|a, b| b.size_bytes.cmp(&a.size_bytes));
43
44    // Limit results
45    large_files.truncate(limit);
46
47    if large_files.is_empty() {
48        output.add_line(match threshold {
49            Some(mb) => format!("āœ… No files found larger than {mb:.1} MB"),
50            None => "āœ… No large files found".to_string(),
51        });
52        output.flush();
53        return Ok(());
54    }
55
56    let count = large_files.len();
57    output.add_line(match threshold {
58        Some(mb) => format!("šŸ“Š Top {count} files larger than {mb:.1} MB:"),
59        None => format!("šŸ“Š Top {count} largest files:"),
60    });
61
62    // Add all file results to buffer
63    for (i, file) in large_files.iter().enumerate() {
64        output.add_line(format_file_line(i + 1, file));
65    }
66
67    // Add summary
68    let total_size: u64 = large_files.iter().map(|f| f.size_bytes).sum();
69    let total_mb = total_size as f64 / (1024.0 * 1024.0);
70    output.add_line(format_summary_message(large_files.len(), total_mb));
71
72    // Flush all output at once for better performance
73    output.flush();
74    Ok(())
75}
76
77fn get_file_objects() -> Result<Vec<(String, String, u64)>> {
78    let output = Command::new("git")
79        .args(get_rev_list_args())
80        .output()
81        .map_err(GitXError::Io)?;
82
83    if !output.status.success() {
84        return Err(GitXError::GitCommand(
85            "Failed to get file objects from git history".to_string(),
86        ));
87    }
88
89    let stdout = String::from_utf8_lossy(&output.stdout);
90    parse_git_objects(&stdout)
91}
92
/// Arguments passed to `git` to list the object ids reachable from all refs, one per line.
///
/// NOTE(review): per the git-rev-list documentation `--filter=blob:none` omits all blobs from
/// the listing — confirm that this is intended for a scan whose purpose is to size files.
fn get_rev_list_args() -> [&'static str; 6] {
    const REV_LIST_ARGS: [&str; 6] = [
        "rev-list",
        "--objects",
        "--all",
        "--no-object-names",
        "--filter=blob:none",
        "--",
    ];

    REV_LIST_ARGS
}
103
104fn parse_git_objects(output: &str) -> Result<Vec<(String, String, u64)>> {
105    let mut objects = Vec::new();
106
107    for line in output.lines() {
108        let hash = line.trim();
109        if hash.is_empty() || hash.len() != 40 {
110            continue;
111        }
112
113        // Get object size
114        if let Ok(size) = get_object_size(hash) {
115            if size > 0 {
116                // Get file paths for this object
117                if let Ok(paths) = get_object_paths(hash) {
118                    for path in paths {
119                        objects.push((hash.to_string(), path, size));
120                    }
121                }
122            }
123        }
124    }
125
126    Ok(objects)
127}
128
129fn get_object_size(hash: &str) -> Result<u64> {
130    let output = Command::new("git")
131        .args(["cat-file", "-s", hash])
132        .output()
133        .map_err(GitXError::Io)?;
134
135    if !output.status.success() {
136        return Err(GitXError::GitCommand(
137            "Failed to get object size".to_string(),
138        ));
139    }
140
141    let size_str = String::from_utf8_lossy(&output.stdout);
142    size_str
143        .trim()
144        .parse()
145        .map_err(|_| GitXError::Parse("Invalid size format".to_string()))
146}
147
148fn get_object_paths(hash: &str) -> Result<Vec<String>> {
149    let output = Command::new("git")
150        .args([
151            "log",
152            "--all",
153            "--pretty=format:",
154            "--name-only",
155            "--diff-filter=A",
156            "-S",
157            hash,
158        ])
159        .output()
160        .map_err(GitXError::Io)?;
161
162    if !output.status.success() {
163        // Fallback: try to find the path using rev-list with object names
164        return get_object_paths_fallback(hash);
165    }
166
167    let stdout = String::from_utf8_lossy(&output.stdout);
168    let paths: Vec<String> = stdout
169        .lines()
170        .filter(|line| !line.trim().is_empty())
171        .map(|line| line.trim().to_string())
172        .collect();
173
174    if paths.is_empty() {
175        get_object_paths_fallback(hash)
176    } else {
177        Ok(paths)
178    }
179}
180
181// Fallback method to get object paths
182fn get_object_paths_fallback(hash: &str) -> Result<Vec<String>> {
183    let output = Command::new("git")
184        .args(["rev-list", "--objects", "--all"])
185        .output()
186        .map_err(GitXError::Io)?;
187
188    let stdout = String::from_utf8_lossy(&output.stdout);
189    let paths: Vec<String> = stdout
190        .lines()
191        .filter_map(|line| {
192            let parts: Vec<&str> = line.split_whitespace().collect();
193            if parts.len() >= 2 && parts[0] == hash {
194                Some(parts[1..].join(" "))
195            } else {
196                None
197            }
198        })
199        .collect();
200
201    if paths.is_empty() {
202        Ok(vec![format!("unknown-{}", &hash[0..8])])
203    } else {
204        Ok(paths)
205    }
206}
207
208fn find_largest_files(
209    objects: Vec<(String, String, u64)>,
210    threshold: Option<f64>,
211) -> Vec<FileInfo> {
212    let mut file_sizes: HashMap<String, u64> = HashMap::new();
213
214    // Group by file path and take the maximum size
215    for (_hash, path, size) in objects {
216        file_sizes
217            .entry(path)
218            .and_modify(|current| *current = (*current).max(size))
219            .or_insert(size);
220    }
221
222    let threshold_bytes = threshold.map(|mb| (mb * 1024.0 * 1024.0) as u64);
223
224    file_sizes
225        .into_iter()
226        .filter(|(_, size)| threshold_bytes.is_none_or(|threshold| *size >= threshold))
227        .map(|(path, size)| FileInfo::new(path, size))
228        .collect()
229}
230
231pub fn format_file_line(index: usize, file: &FileInfo) -> String {
232    format!(
233        "{index:2}. {size:>8.1} MB  {path}",
234        size = file.size_mb,
235        path = file.path
236    )
237}
238
/// Formats the closing summary line: a leading blank line, then the number of files and their
/// combined size in MB with one decimal.
fn format_summary_message(count: usize, total_mb: f64) -> String {
    format!("\n📈 Total: {count} files, {total_mb:.1} MB")
}
242
/// Formats a byte count using the largest unit among B, KB, MB, GB and TB (steps of 1024) for
/// which the value is at least 1, capped at TB.
///
/// Plain bytes are printed without decimals; every larger unit with one decimal.
pub fn format_size_human_readable(bytes: u64) -> String {
    const UNITS: [&str; 5] = ["B", "KB", "MB", "GB", "TB"];

    let mut value = bytes as f64;
    let mut unit = UNITS[0];

    // Step up one unit at a time until the value drops below 1024 or the units run out.
    for &next_unit in &UNITS[1..] {
        if value < 1024.0 {
            break;
        }
        value /= 1024.0;
        unit = next_unit;
    }

    if unit == UNITS[0] {
        format!("{value:.0} {unit}")
    } else {
        format!("{value:.1} {unit}")
    }
}
258}