scancode_rust/scanner/
process.rs

1use crate::askalono::{ScanStrategy, TextData};
2use crate::models::{FileInfo, FileInfoBuilder, FileType, LicenseDetection, Match};
3use crate::parsers::{CargoParser, NpmParser, PythonParser, PackageParser};
4use crate::scanner::ProcessResult;
5use crate::utils::file::{get_creation_date, is_path_excluded};
6use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha256};
7use crate::utils::language::detect_language;
8use anyhow::Error;
9use content_inspector::{ContentType, inspect};
10use glob::Pattern;
11use indicatif::ProgressBar;
12use mime_guess::from_path;
13use rayon::prelude::*;
14use std::fs::{self};
15use std::path::Path;
16use std::sync::Arc;
17
// NOTE(review): stale comment — the license-score threshold constant it
// described no longer lives in this file; scoring is delegated to `ScanStrategy`.
20pub fn process<P: AsRef<Path>>(
21    path: P,
22    max_depth: usize,
23    progress_bar: Arc<ProgressBar>,
24    exclude_patterns: &[Pattern],
25    scan_strategy: &ScanStrategy,
26) -> Result<ProcessResult, Error> {
27    let path = path.as_ref();
28
29    if is_path_excluded(path, exclude_patterns) {
30        return Ok(ProcessResult {
31            files: Vec::new(),
32            excluded_count: 1,
33        });
34    }
35
36    let mut all_files = Vec::new();
37    let mut total_excluded = 0;
38
39    // Read directory entries and group by exclusion status and type
40    let entries: Vec<_> = fs::read_dir(path)?.filter_map(Result::ok).collect();
41
42    let mut file_entries = Vec::new();
43    let mut dir_entries = Vec::new();
44
45    for entry in entries {
46        let path = entry.path();
47
48        // Check exclusion only once per path
49        if is_path_excluded(&path, exclude_patterns) {
50            total_excluded += 1;
51            continue;
52        }
53
54        match fs::metadata(&path) {
55            Ok(metadata) if metadata.is_file() => file_entries.push((path, metadata)),
56            Ok(metadata) if path.is_dir() => dir_entries.push((path, metadata)),
57            _ => continue,
58        }
59    }
60
61    // Process files in parallel
62    all_files.append(
63        &mut file_entries
64            .par_iter()
65            .map(|(path, metadata)| {
66                let file_entry = process_file(path, metadata, scan_strategy);
67                progress_bar.inc(1);
68                file_entry
69            })
70            .collect(),
71    );
72
73    // Process directories
74    for (path, metadata) in dir_entries {
75        all_files.push(process_directory(&path, &metadata));
76
77        if max_depth > 0 {
78            match process(
79                &path,
80                max_depth - 1,
81                progress_bar.clone(),
82                exclude_patterns,
83                scan_strategy,
84            ) {
85                Ok(mut result) => {
86                    all_files.append(&mut result.files);
87                    total_excluded += result.excluded_count;
88                }
89                Err(e) => eprintln!("Error processing directory {}: {}", path.display(), e),
90            }
91        }
92    }
93
94    Ok(ProcessResult {
95        files: all_files,
96        excluded_count: total_excluded,
97    })
98}
99
100fn process_file(path: &Path, metadata: &fs::Metadata, scan_strategy: &ScanStrategy) -> FileInfo {
101    let mut scan_errors: Vec<String> = vec![];
102    let mut file_info_builder = FileInfoBuilder::default();
103
104    if let Err(e) = extract_information_from_content(&mut file_info_builder, path, scan_strategy) {
105        scan_errors.push(e.to_string());
106    };
107
108    return file_info_builder
109        .name(path.file_name().unwrap().to_string_lossy().to_string())
110        .base_name(
111            path.file_stem()
112                .unwrap_or_default()
113                .to_string_lossy()
114                .to_string(),
115        )
116        .extension(
117            path.extension()
118                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
119        )
120        .path(path.to_string_lossy().to_string())
121        .file_type(FileType::File)
122        .mime_type(Some(
123            from_path(path)
124                .first_or_octet_stream()
125                .essence_str()
126                .to_string(),
127        ))
128        .size(metadata.len())
129        .date(get_creation_date(metadata))
130        .scan_errors(scan_errors)
131        .build()
132        .expect("FileInformationBuild not completely initialized");
133}
134
135fn extract_information_from_content(
136    file_info_builder: &mut FileInfoBuilder,
137    path: &Path,
138    scan_strategy: &ScanStrategy,
139) -> Result<(), Error> {
140    let buffer = fs::read(path)?;
141
142    file_info_builder
143        .sha1(Some(calculate_sha1(&buffer)))
144        .md5(Some(calculate_md5(&buffer)))
145        .sha256(Some(calculate_sha256(&buffer)))
146        .programming_language(Some(detect_language(path, &buffer)));
147
148    if NpmParser::is_match(path) {
149        let package_data = vec![NpmParser::extract_package_data(path)];
150        file_info_builder.package_data(package_data);
151        Ok(())
152    } else if CargoParser::is_match(path) {
153        let package_data = vec![CargoParser::extract_package_data(path)];
154        file_info_builder.package_data(package_data);
155        Ok(())
156    } else if PythonParser::is_match(path) {
157        let package_data = vec![PythonParser::extract_package_data(path)];
158        file_info_builder.package_data(package_data);
159        Ok(())
160    } else if inspect(&buffer) == ContentType::UTF_8 {
161        extract_license_information(
162            file_info_builder,
163            String::from_utf8_lossy(&buffer).into_owned(),
164            scan_strategy,
165        )
166    } else {
167        Ok(())
168    }
169}
170
171fn extract_license_information(
172    file_info_builder: &mut FileInfoBuilder,
173    text_content: String,
174    scan_strategy: &ScanStrategy,
175) -> Result<(), Error> {
176    // Analyze license with the text content
177    if text_content.is_empty() {
178        return Ok(());
179    }
180
181    let license_result = scan_strategy.scan(&TextData::from(text_content.as_str()))?;
182    let license_expr = license_result
183        .license
184        .and_then(|x| Some(x.name.to_string()));
185
186    let license_detections = license_result
187        .containing
188        .iter()
189        .map(|detection| LicenseDetection {
190            license_expression: detection.license.name.to_string(),
191            matches: vec![Match {
192                score: detection.score as f64,
193                start_line: detection.line_range.0,
194                end_line: detection.line_range.1,
195                license_expression: detection.license.name.to_string(),
196                matched_text: None, //TODO
197                rule_identifier: None,
198            }],
199        })
200        .collect::<Vec<_>>();
201
202    file_info_builder
203        .license_expression(license_expr)
204        .license_detections(license_detections);
205
206    Ok(())
207}
208
209fn process_directory(path: &Path, metadata: &fs::Metadata) -> FileInfo {
210    let name = path
211        .file_name()
212        .unwrap_or_default()
213        .to_string_lossy()
214        .to_string();
215    let base_name = name.clone(); // For directories, base_name is the same as name
216
217    FileInfo {
218        name,
219        base_name,
220        extension: "".to_string(),
221        path: path.to_string_lossy().to_string(),
222        file_type: FileType::Directory,
223        mime_type: None,
224        size: 0,
225        date: get_creation_date(metadata),
226        sha1: None,
227        md5: None,
228        sha256: None,
229        programming_language: None,
230        package_data: Vec::new(), // TODO: implement
231        license_expression: None,
232        copyrights: Vec::new(),         // TODO: implement
233        license_detections: Vec::new(), // TODO: implement
234        urls: Vec::new(),               // TODO: implement
235        scan_errors: Vec::new(),
236    }
237}