// scancode_rust/scanner/process.rs

1use crate::askalono::{ScanStrategy, TextData};
2use crate::models::{FileInfo, FileInfoBuilder, FileType, LicenseDetection, Match};
3use crate::parsers::{NpmParser, PackageParser};
4use crate::scanner::ProcessResult;
5use crate::utils::file::{get_creation_date, is_path_excluded};
6use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha256};
7use crate::utils::language::detect_language;
8use anyhow::Error;
9use content_inspector::{ContentType, inspect};
10use glob::Pattern;
11use indicatif::ProgressBar;
12use mime_guess::from_path;
13use rayon::prelude::*;
14use std::fs::{self};
15use std::path::Path;
16use std::sync::Arc;
17
// NOTE(review): stale comment — it described a license-detection score threshold
// constant that is no longer defined in this file; confirm and remove.
19
20pub fn process<P: AsRef<Path>>(
21    path: P,
22    max_depth: usize,
23    progress_bar: Arc<ProgressBar>,
24    exclude_patterns: &[Pattern],
25    scan_strategy: &ScanStrategy,
26) -> Result<ProcessResult, Error> {
27    let path = path.as_ref();
28
29    if is_path_excluded(path, exclude_patterns) {
30        return Ok(ProcessResult {
31            files: Vec::new(),
32            excluded_count: 1,
33        });
34    }
35
36    let mut all_files = Vec::new();
37    let mut total_excluded = 0;
38
39    // Read directory entries and group by exclusion status and type
40    let entries: Vec<_> = fs::read_dir(path)?.filter_map(Result::ok).collect();
41
42    let mut file_entries = Vec::new();
43    let mut dir_entries = Vec::new();
44
45    for entry in entries {
46        let path = entry.path();
47
48        // Check exclusion only once per path
49        if is_path_excluded(&path, exclude_patterns) {
50            total_excluded += 1;
51            continue;
52        }
53
54        match fs::metadata(&path) {
55            Ok(metadata) if metadata.is_file() => file_entries.push((path, metadata)),
56            Ok(metadata) if path.is_dir() => dir_entries.push((path, metadata)),
57            _ => continue,
58        }
59    }
60
61    // Process files in parallel
62    all_files.append(
63        &mut file_entries
64            .par_iter()
65            .map(|(path, metadata)| {
66                let file_entry = process_file(path, metadata, scan_strategy);
67                progress_bar.inc(1);
68                file_entry
69            })
70            .collect(),
71    );
72
73    // Process directories
74    for (path, metadata) in dir_entries {
75        all_files.push(process_directory(&path, &metadata));
76
77        if max_depth > 0 {
78            match process(
79                &path,
80                max_depth - 1,
81                progress_bar.clone(),
82                exclude_patterns,
83                scan_strategy,
84            ) {
85                Ok(mut result) => {
86                    all_files.append(&mut result.files);
87                    total_excluded += result.excluded_count;
88                }
89                Err(e) => eprintln!("Error processing directory {}: {}", path.display(), e),
90            }
91        }
92    }
93
94    Ok(ProcessResult {
95        files: all_files,
96        excluded_count: total_excluded,
97    })
98}
99
100fn process_file(path: &Path, metadata: &fs::Metadata, scan_strategy: &ScanStrategy) -> FileInfo {
101    let mut scan_errors: Vec<String> = vec![];
102    let mut file_info_builder = FileInfoBuilder::default();
103
104    if let Err(e) = extract_information_from_content(&mut file_info_builder, path, scan_strategy) {
105        scan_errors.push(e.to_string());
106    };
107
108    return file_info_builder
109        .name(path.file_name().unwrap().to_string_lossy().to_string())
110        .base_name(
111            path.file_stem()
112                .unwrap_or_default()
113                .to_string_lossy()
114                .to_string(),
115        )
116        .extension(
117            path.extension()
118                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
119        )
120        .path(path.to_string_lossy().to_string())
121        .file_type(FileType::File)
122        .mime_type(Some(
123            from_path(path)
124                .first_or_octet_stream()
125                .essence_str()
126                .to_string(),
127        ))
128        .size(metadata.len())
129        .date(get_creation_date(metadata))
130        .scan_errors(scan_errors)
131        .build()
132        .expect("FileInformationBuild not completely initialized");
133}
134
135fn extract_information_from_content(
136    file_info_builder: &mut FileInfoBuilder,
137    path: &Path,
138    scan_strategy: &ScanStrategy,
139) -> Result<(), Error> {
140    let buffer = fs::read(path)?;
141
142    file_info_builder
143        .sha1(Some(calculate_sha1(&buffer)))
144        .md5(Some(calculate_md5(&buffer)))
145        .sha256(Some(calculate_sha256(&buffer)))
146        .programming_language(Some(detect_language(path, &buffer)));
147
148    if NpmParser::is_match(path) {
149        let package_data = vec![NpmParser::extract_package_data(path)];
150        file_info_builder.package_data(package_data);
151        Ok(())
152    } else if inspect(&buffer) == ContentType::UTF_8 {
153        extract_license_information(
154            file_info_builder,
155            String::from_utf8_lossy(&buffer).into_owned(),
156            scan_strategy,
157        )
158    } else {
159        Ok(())
160    }
161}
162
163fn extract_license_information(
164    file_info_builder: &mut FileInfoBuilder,
165    text_content: String,
166    scan_strategy: &ScanStrategy,
167) -> Result<(), Error> {
168    // Analyze license with the text content
169    if text_content.is_empty() {
170        return Ok(());
171    }
172
173    let license_result = scan_strategy.scan(&TextData::from(text_content.as_str()))?;
174    let license_expr = license_result
175        .license
176        .and_then(|x| Some(x.name.to_string()));
177
178    let license_detections = license_result
179        .containing
180        .iter()
181        .map(|detection| LicenseDetection {
182            license_expression: detection.license.name.to_string(),
183            matches: vec![Match {
184                score: detection.score as f64,
185                start_line: detection.line_range.0,
186                end_line: detection.line_range.1,
187                license_expression: detection.license.name.to_string(),
188                matched_text: None, //TODO
189                rule_identifier: None,
190            }],
191        })
192        .collect::<Vec<_>>();
193
194    file_info_builder
195        .license_expression(license_expr)
196        .license_detections(license_detections);
197
198    Ok(())
199}
200
201fn process_directory(path: &Path, metadata: &fs::Metadata) -> FileInfo {
202    let name = path
203        .file_name()
204        .unwrap_or_default()
205        .to_string_lossy()
206        .to_string();
207    let base_name = name.clone(); // For directories, base_name is the same as name
208
209    FileInfo {
210        name,
211        base_name,
212        extension: "".to_string(),
213        path: path.to_string_lossy().to_string(),
214        file_type: FileType::Directory,
215        mime_type: None,
216        size: 0,
217        date: get_creation_date(metadata),
218        sha1: None,
219        md5: None,
220        sha256: None,
221        programming_language: None,
222        package_data: Vec::new(), // TODO: implement
223        license_expression: None,
224        copyrights: Vec::new(),         // TODO: implement
225        license_detections: Vec::new(), // TODO: implement
226        urls: Vec::new(),               // TODO: implement
227        scan_errors: Vec::new(),
228    }
229}