repcon/
file_splitting.rs

1use crate::PageFormat;
2use std::fs::File;
3use std::io::{self, BufRead, BufReader, Write};
4use std::path::{Path, PathBuf};
5
/// Opens a fresh output file named `<output_name>_<file_counter>.txt` inside
/// `output_directory`.
///
/// The file is opened with [`File::create`]. On success the open handle is returned
/// together with the full path that was used.
///
/// # Errors
///
/// Returns the I/O error reported by [`File::create`] when the file cannot be created.
fn create_new_output_file(
    output_directory: &Path,
    file_counter: u64,
    output_name: &str,
) -> io::Result<(File, PathBuf)> {
    let file_name = format!("{}_{}.txt", output_name, file_counter);
    let file_path = output_directory.join(file_name);
    // The borrow taken by `File::create` ends with the call, so the path can be moved
    // into the returned tuple afterwards.
    File::create(&file_path).map(|file| (file, file_path))
}
17
18/// Checks if the size of the page header and footer exceeds the maximum allowed file size.
19/// Returns an error if the combined size is too large.
20fn check_max_output_file_size(
21    page_format: &PageFormat,
22    max_output_file_size: u64,
23) -> io::Result<()> {
24    if page_format.get_page_header_size() + page_format.get_page_footer_size()
25        > max_output_file_size
26    {
27        let error_message = format!(
28            "Error: The maximum file size ({}) is too small to contain the page header and footer.",
29            max_output_file_size
30        );
31        Err(io::Error::new(io::ErrorKind::InvalidData, error_message))
32    } else {
33        Ok(())
34    }
35}
36
37/// Creates a new file for output, updates the file counter and size, and adds the file to the list.
38/// This function is used when the current file reaches its maximum size and a new file is needed.
39fn next_output_file(
40    output_directory: &Path,
41    output_file_counter: &mut u64,
42    output_name: &str,
43    current_output_file_size: &mut u64,
44    generated_output_files: &mut Vec<PathBuf>,
45    output_file: &mut File,
46    output_file_path: &mut PathBuf,
47) -> io::Result<()> {
48    *output_file_counter += 1;
49    *current_output_file_size = 0;
50    (*output_file, *output_file_path) =
51        create_new_output_file(output_directory, *output_file_counter, output_name)?;
52    generated_output_files.push(output_file_path.to_path_buf());
53    Ok(())
54}
55
56/// Splits the target files into chunks based on a maximum file size.
57/// Generates multiple files if necessary, each containing a portion of the target files.
58/// Returns a vector of paths to the generated files.
59///
60/// # Examples
61///
62/// ```
63/// use repcon::split_files_into_chunks;
64/// use std::path::{Path, PathBuf};
65/// use std::fs::File;
66/// use std::io::Write;
67///
68/// // Suppose you have a directory with files that you want to split
69/// let output_directory = Path::new("./tests/output");
70/// let target_files_root_path = Some(Path::new("./"));
71/// let target_files = vec![
72///     PathBuf::from("./src/main.rs"),
73///     PathBuf::from("./src/lib.rs"),
74/// ];
75/// let max_output_file_size = 2048; // 2KB max file size
76/// let output_name = "chunked_file";
77///
78/// let generated_files = split_files_into_chunks(
79///     &target_files,
80///     target_files_root_path,
81///     output_directory,
82///     max_output_file_size,
83///     output_name,
84/// ).unwrap();
85/// ```
86///
87/// # Errors
88///
89/// This function will return an `Err` if the file paths contain invalid UTF-8 characters
90/// or if the maximum file size is too small to contain even one chunk of the target files.
91pub fn split_files_into_chunks(
92    target_files: &[PathBuf],
93    target_files_root_path: Option<&Path>,
94    output_directory: &Path,
95    max_output_file_size: u64,
96    output_name: &str,
97) -> io::Result<Vec<PathBuf>> {
98    let mut generated_output_files = Vec::new();
99    let mut output_file_counter: u64 = 1;
100    let mut current_output_file_size: u64 = 0;
101    let mut current_target_file_name: String;
102    let mut page_format: PageFormat;
103
104    // Create the first file
105    let (mut output_file, mut output_file_path) =
106        create_new_output_file(output_directory, output_file_counter, output_name)?;
107    generated_output_files.push(output_file_path.clone());
108
109    for target_file_path in target_files {
110        current_target_file_name = match target_file_path.to_str() {
111            Some(name) => name.to_string(),
112            None => {
113                return Err(io::Error::new(
114                    io::ErrorKind::InvalidData,
115                    "Target File path contains invalid UTF-8 characters",
116                ));
117            }
118        };
119
120        let file = match File::open(target_file_path) {
121            Ok(file) => file,
122            Err(e) => {
123                eprintln!("Failed to open file {:?}: {}", target_file_path, e);
124                continue;
125            }
126        };
127
128        page_format = PageFormat::new(current_target_file_name, target_files_root_path);
129        check_max_output_file_size(&page_format, max_output_file_size)?;
130
131        if current_output_file_size + page_format.header_size + page_format.footer_size
132            > max_output_file_size
133        {
134            next_output_file(
135                output_directory,
136                &mut output_file_counter,
137                output_name,
138                &mut current_output_file_size,
139                &mut generated_output_files,
140                &mut output_file,
141                &mut output_file_path.clone(),
142            )?;
143        }
144
145        write!(output_file, "{}", page_format.header)?;
146        current_output_file_size += page_format.header_size;
147
148        let reader = BufReader::new(file);
149        for line_result in reader.lines() {
150            if line_result.is_err() {
151                eprintln!("Skipping non-text file: {:?}", target_file_path);
152                break;
153            }
154            let line = line_result.unwrap();
155            let line_size = line.as_bytes().len() as u64 + 1; // +1 for the newline character
156
157            if current_output_file_size + line_size + page_format.footer_size > max_output_file_size
158            {
159                write!(output_file, "{}", page_format.footer)?;
160
161                next_output_file(
162                    output_directory,
163                    &mut output_file_counter,
164                    output_name,
165                    &mut current_output_file_size,
166                    &mut generated_output_files,
167                    &mut output_file,
168                    &mut output_file_path,
169                )?;
170
171                page_format.increment_page_number();
172                check_max_output_file_size(&page_format, max_output_file_size)?;
173                write!(output_file, "{}", page_format.header)?;
174                current_output_file_size += page_format.header_size;
175            }
176
177            writeln!(output_file, "{}", line)?;
178            current_output_file_size += line_size;
179        }
180        write!(output_file, "{}", page_format.footer)?;
181    }
182    Ok(generated_output_files)
183}
184
#[cfg(test)]
mod split_tests {
    use super::*;
    use std::fs::{self, File};
    use std::io::Write;
    use tempfile::tempdir;

    /// Creates `count` one-line text files named `test_file_<i>.txt` in `directory` and
    /// returns their paths in creation order.
    fn write_text_fixtures(directory: &Path, count: usize) -> io::Result<Vec<PathBuf>> {
        (0..count)
            .map(|index| -> io::Result<PathBuf> {
                let file_path = directory.join(format!("test_file_{}.txt", index));
                let mut test_file = File::create(&file_path)?;
                writeln!(test_file, "Test data for file {}", index)?;
                Ok(file_path)
            })
            .collect()
    }

    /// Asserts that every generated chunk contains both code-block markers and does not
    /// exceed `max_output_file_size` bytes.
    fn assert_chunks_are_well_formed(
        chunk_paths: &[PathBuf],
        max_output_file_size: u64,
    ) -> io::Result<()> {
        for chunk_path in chunk_paths {
            let chunk = fs::read_to_string(chunk_path)?;
            assert!(chunk.contains("// START OF CODE BLOCK"));
            assert!(chunk.contains("// END OF CODE BLOCK"));
            assert!(chunk.len() as u64 <= max_output_file_size);
        }
        Ok(())
    }

    /// With a small size limit, each of the five inputs must land in its own output file.
    #[test]
    fn test_split_files_into_small_chunks() -> io::Result<()> {
        let num_test_files = 5;
        let max_output_file_size = 200;
        let temp_dir = tempdir()?;
        let files = write_text_fixtures(temp_dir.path(), num_test_files)?;

        let generated_output_files = split_files_into_chunks(
            &files,
            Some(temp_dir.path()),
            temp_dir.path(),
            max_output_file_size,
            "output",
        )?;

        assert!(!generated_output_files.is_empty());
        assert_eq!(generated_output_files.len(), num_test_files);
        assert_chunks_are_well_formed(&generated_output_files, max_output_file_size)
    }

    /// With a generous size limit, all five inputs must share a single output file.
    #[test]
    fn test_split_files_into_single_large_chunk() -> io::Result<()> {
        let num_test_files = 5;
        let max_output_file_size = 3000;
        let temp_dir = tempdir()?;
        let files = write_text_fixtures(temp_dir.path(), num_test_files)?;

        let generated_output_files = split_files_into_chunks(
            &files,
            None,
            temp_dir.path(),
            max_output_file_size,
            "output",
        )?;

        assert!(!generated_output_files.is_empty());
        assert_eq!(generated_output_files.len(), 1);
        assert_chunks_are_well_formed(&generated_output_files, max_output_file_size)
    }

    /// A limit too small for the header and footer must be reported as an error.
    #[test]
    fn test_split_files_with_insufficient_size() -> io::Result<()> {
        let max_output_file_size = 10;
        let temp_dir = tempdir()?;

        let file_path = temp_dir.path().join("test_file.txt");
        let mut test_file = File::create(&file_path)?;
        writeln!(test_file, "Test data for file")?;
        let files = vec![file_path];

        let result = split_files_into_chunks(
            &files,
            None,
            temp_dir.path(),
            max_output_file_size,
            "output",
        );

        assert!(result.is_err());
        Ok(())
    }

    /// A non-text input must still yield one output file with both markers.
    #[test]
    fn test_split_binary_files_error() -> io::Result<()> {
        let max_output_file_size = 300;
        let temp_dir = tempdir()?;

        let file_path = temp_dir.path().join("test_file.bin");
        let mut test_file = File::create(&file_path)?;
        test_file.write_all(&[0, 159, 146, 150])?;
        let files = vec![file_path];

        let generated_output_files = split_files_into_chunks(
            &files,
            None,
            temp_dir.path(),
            max_output_file_size,
            "output",
        )?;

        assert_eq!(generated_output_files.len(), 1);
        assert_chunks_are_well_formed(&generated_output_files, max_output_file_size)
    }
}
320}