1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::try_parse_file;
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha256};
4use crate::utils::language::detect_language;
5use crate::utils::text::{is_source, remove_verbatim_escape_sequences};
6use anyhow::Error;
7use mime_guess::from_path;
8use rayon::prelude::*;
9use std::fs::{self};
10use std::path::Path;
11use std::sync::Arc;
12use std::time::{Duration, Instant};
13
14use crate::cache::{CachedScanFindings, read_cached_findings, write_cached_findings};
15use crate::copyright::{
16 self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
17};
18use crate::finder::{self, DetectionConfig};
19use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
20use crate::license_detection::query::Query;
21use crate::models::{
22 Author, Copyright, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection, Match,
23 OutputEmail, OutputURL,
24};
25use crate::progress::ScanProgress;
26use crate::scanner::collect::CollectedPaths;
27use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
28use crate::utils::file::{ExtractedTextKind, extract_text_for_detection, get_creation_date};
29use crate::utils::generated::generated_code_hints_from_bytes;
30
/// Matching BEGIN/END marker pairs for PEM-encoded certificates.
///
/// `is_pem_certificate_file` treats a buffer as a certificate when both markers of at least
/// one pair appear among its leading non-empty lines.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
38
39pub fn process_collected(
40 collected: &CollectedPaths,
41 progress: Arc<ScanProgress>,
42 license_engine: Option<Arc<LicenseDetectionEngine>>,
43 license_options: LicenseScanOptions,
44 text_options: &TextDetectionOptions,
45) -> ProcessResult {
46 let mut all_files: Vec<FileInfo> = collected
47 .files
48 .par_iter()
49 .map(|(path, metadata)| {
50 let file_entry = process_file(
51 path,
52 metadata,
53 license_engine.clone(),
54 license_options,
55 text_options,
56 );
57 progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
58 file_entry
59 })
60 .collect();
61
62 for (path, metadata) in &collected.directories {
63 all_files.push(process_directory(
64 path,
65 metadata,
66 text_options.collect_info,
67 license_engine.is_some(),
68 ));
69 }
70
71 ProcessResult {
72 files: all_files,
73 excluded_count: collected.excluded_count,
74 }
75}
76
77fn process_file(
78 path: &Path,
79 metadata: &fs::Metadata,
80 license_engine: Option<Arc<LicenseDetectionEngine>>,
81 license_options: LicenseScanOptions,
82 text_options: &TextDetectionOptions,
83) -> FileInfo {
84 let mut scan_errors: Vec<String> = vec![];
85 let mut file_info_builder = FileInfoBuilder::default();
86 let license_enabled = license_engine.is_some();
87
88 let started = Instant::now();
89
90 let mut generated_flag = None;
91 match extract_information_from_content(
92 &mut file_info_builder,
93 &mut scan_errors,
94 path,
95 license_engine,
96 license_options,
97 text_options,
98 ) {
99 Ok(is_generated) => generated_flag = is_generated,
100 Err(e) => scan_errors.push(e.to_string()),
101 };
102
103 if is_timeout_exceeded(started, text_options.timeout_seconds) {
104 scan_errors.push(format!(
105 "Processing interrupted due to timeout after {:.2} seconds",
106 text_options.timeout_seconds
107 ));
108 }
109
110 let mut file_info = file_info_builder
111 .name(path.file_name().unwrap().to_string_lossy().to_string())
112 .base_name(
113 path.file_stem()
114 .unwrap_or_default()
115 .to_string_lossy()
116 .to_string(),
117 )
118 .extension(
119 path.extension()
120 .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
121 )
122 .path(path.to_string_lossy().to_string())
123 .file_type(FileType::File)
124 .mime_type(Some(
125 from_path(path)
126 .first_or_octet_stream()
127 .essence_str()
128 .to_string(),
129 ))
130 .size(metadata.len())
131 .date(get_creation_date(metadata))
132 .scan_errors(scan_errors)
133 .build()
134 .expect("FileInformationBuild not completely initialized");
135
136 if text_options.collect_info {
137 file_info.is_source = Some(is_source(path));
138 }
139
140 if file_info.programming_language.as_deref() == Some("Go")
141 && is_go_non_production_source(path).unwrap_or(false)
142 {
143 file_info.is_source = Some(false);
144 }
145
146 if text_options.detect_generated {
147 file_info.is_generated = Some(generated_flag.unwrap_or(false));
148 }
149
150 if file_info.percentage_of_license_text.is_none() && license_enabled {
151 file_info.percentage_of_license_text = Some(0.0);
152 }
153
154 if let (Some(scan_results_dir), Some(sha256)) = (
155 text_options.scan_cache_dir.as_deref(),
156 file_info.sha256.as_deref(),
157 ) && file_info.scan_errors.is_empty()
158 {
159 let findings = CachedScanFindings::from_file_info(&file_info);
160 let options_fingerprint =
161 scan_cache_fingerprint(text_options, license_options, license_enabled);
162 if let Err(err) =
163 write_cached_findings(scan_results_dir, sha256, &options_fingerprint, &findings)
164 {
165 file_info
166 .scan_errors
167 .push(format!("Failed to write scan cache entry: {err}"));
168 }
169 }
170
171 file_info
172}
173
174fn extract_information_from_content(
175 file_info_builder: &mut FileInfoBuilder,
176 scan_errors: &mut Vec<String>,
177 path: &Path,
178 license_engine: Option<Arc<LicenseDetectionEngine>>,
179 license_options: LicenseScanOptions,
180 text_options: &TextDetectionOptions,
181) -> Result<Option<bool>, Error> {
182 let started = Instant::now();
183 let buffer = fs::read(path)?;
184 let license_enabled = license_engine.is_some();
185
186 if is_timeout_exceeded(started, text_options.timeout_seconds) {
187 return Err(Error::msg(format!(
188 "Timeout while reading file content (> {:.2}s)",
189 text_options.timeout_seconds
190 )));
191 }
192
193 let sha256 = calculate_sha256(&buffer);
194 let is_generated = text_options
195 .detect_generated
196 .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
197
198 file_info_builder
199 .sha1(Some(calculate_sha1(&buffer)))
200 .md5(Some(calculate_md5(&buffer)))
201 .sha256(Some(sha256.clone()))
202 .programming_language(Some(detect_language(path, &buffer)));
203
204 if should_skip_text_detection(path, &buffer) {
205 return Ok(is_generated);
206 }
207
208 if let Some(scan_results_dir) = text_options.scan_cache_dir.as_deref() {
209 let options_fingerprint =
210 scan_cache_fingerprint(text_options, license_options, license_enabled);
211 match read_cached_findings(scan_results_dir, &sha256, &options_fingerprint) {
212 Ok(Some(findings)) => {
213 file_info_builder
214 .package_data(findings.package_data)
215 .license_expression(findings.license_expression)
216 .license_detections(findings.license_detections)
217 .license_clues(findings.license_clues)
218 .percentage_of_license_text(findings.percentage_of_license_text)
219 .copyrights(findings.copyrights)
220 .holders(findings.holders)
221 .authors(findings.authors)
222 .emails(findings.emails)
223 .urls(findings.urls)
224 .programming_language(findings.programming_language);
225 return Ok(is_generated);
226 }
227 Ok(None) => {}
228 Err(err) => {
229 scan_errors.push(format!("Failed to read scan cache for {:?}: {}", path, err));
230 }
231 }
232 }
233
234 if text_options.detect_packages
237 && let Some(parse_result) = try_parse_file(path)
238 {
239 file_info_builder.package_data(parse_result.packages);
240 scan_errors.extend(parse_result.scan_errors);
241 }
242
243 if is_timeout_exceeded(started, text_options.timeout_seconds) {
244 return Err(Error::msg(format!(
245 "Timeout while extracting package/text metadata (> {:.2}s)",
246 text_options.timeout_seconds
247 )));
248 }
249
250 let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
251 let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
252
253 if is_timeout_exceeded(started, text_options.timeout_seconds) {
254 return Err(Error::msg(format!(
255 "Timeout while extracting text content (> {:.2}s)",
256 text_options.timeout_seconds
257 )));
258 }
259
260 if text_content.is_empty() {
261 return Ok(is_generated);
262 }
263
264 if text_options.detect_copyrights {
265 extract_copyright_information(
266 file_info_builder,
267 path,
268 &text_content,
269 text_options.timeout_seconds,
270 from_binary_strings,
271 );
272 }
273 extract_email_url_information(file_info_builder, &text_content, text_options);
274
275 if is_timeout_exceeded(started, text_options.timeout_seconds) {
276 return Err(Error::msg(format!(
277 "Timeout before license scan (> {:.2}s)",
278 text_options.timeout_seconds
279 )));
280 }
281 let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
283 if let Some(sourcemap_content) =
284 crate::utils::sourcemap::extract_sourcemap_content(&text_content)
285 {
286 sourcemap_content
287 } else {
288 text_content
289 }
290 } else if is_source(path) {
291 remove_verbatim_escape_sequences(&text_content)
292 } else {
293 text_content
294 };
295
296 extract_license_information(
297 file_info_builder,
298 scan_errors,
299 path,
300 text_content_for_license_detection,
301 license_engine,
302 license_options,
303 from_binary_strings,
304 )?;
305
306 Ok(is_generated)
307}
308
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is non-finite (NaN or infinite), zero, or negative disables the check, so
/// the function returns `false` for those values.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }

    started.elapsed().as_secs_f64() > timeout_seconds
}
314
315fn scan_cache_fingerprint(
316 text_options: &TextDetectionOptions,
317 license_options: LicenseScanOptions,
318 license_enabled: bool,
319) -> String {
320 format!(
321 "packages={};copyrights={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={}",
322 text_options.detect_packages,
323 text_options.detect_copyrights,
324 text_options.detect_emails,
325 text_options.detect_urls,
326 text_options.max_emails,
327 text_options.max_urls,
328 text_options.timeout_seconds,
329 license_enabled,
330 license_options.include_text,
331 license_options.include_text_diagnostics,
332 license_options.include_diagnostics,
333 license_options.unknown_licenses,
334 )
335}
336
337fn extract_copyright_information(
338 file_info_builder: &mut FileInfoBuilder,
339 path: &Path,
340 text_content: &str,
341 timeout_seconds: f64,
342 from_binary_strings: bool,
343) {
344 if copyright::is_credits_file(path) {
346 let author_detections = copyright::detect_credits_authors(text_content);
347 if !author_detections.is_empty() {
348 file_info_builder.authors(
349 author_detections
350 .into_iter()
351 .map(|a| Author {
352 author: a.author,
353 start_line: a.start_line,
354 end_line: a.end_line,
355 })
356 .collect(),
357 );
358 return;
359 }
360 }
361
362 let copyright_options = CopyrightDetectionOptions {
363 max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
364 Some(Duration::from_secs_f64(timeout_seconds))
365 } else {
366 None
367 },
368 ..CopyrightDetectionOptions::default()
369 };
370
371 let (copyrights, holders, authors) =
372 copyright::detect_copyrights_with_options(text_content, ©right_options);
373 let (copyrights, holders, authors) = if from_binary_strings {
374 prune_binary_string_detections(copyrights, holders, authors)
375 } else {
376 (copyrights, holders, authors)
377 };
378
379 file_info_builder.copyrights(
380 copyrights
381 .into_iter()
382 .map(|c| Copyright {
383 copyright: c.copyright,
384 start_line: c.start_line,
385 end_line: c.end_line,
386 })
387 .collect::<Vec<Copyright>>(),
388 );
389 file_info_builder.holders(
390 holders
391 .into_iter()
392 .map(|h| Holder {
393 holder: h.holder,
394 start_line: h.start_line,
395 end_line: h.end_line,
396 })
397 .collect::<Vec<Holder>>(),
398 );
399 file_info_builder.authors(
400 authors
401 .into_iter()
402 .map(|a| Author {
403 author: a.author,
404 start_line: a.start_line,
405 end_line: a.end_line,
406 })
407 .collect::<Vec<Author>>(),
408 );
409}
410
411fn prune_binary_string_detections(
412 copyrights: Vec<CopyrightDetection>,
413 holders: Vec<HolderDetection>,
414 _authors: Vec<AuthorDetection>,
415) -> (
416 Vec<CopyrightDetection>,
417 Vec<HolderDetection>,
418 Vec<AuthorDetection>,
419) {
420 let kept_copyrights: Vec<CopyrightDetection> = copyrights
421 .into_iter()
422 .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
423 .collect();
424
425 let kept_holders: Vec<HolderDetection> = holders
426 .into_iter()
427 .filter(|holder| {
428 kept_copyrights.iter().any(|copyright| {
429 ranges_overlap(
430 holder.start_line,
431 holder.end_line,
432 copyright.start_line,
433 copyright.end_line,
434 )
435 })
436 })
437 .collect();
438
439 (kept_copyrights, kept_holders, Vec::new())
440}
441
/// Tells whether the inclusive ranges `[a_start, a_end]` and `[b_start, b_end]` share at
/// least one position.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    // The ranges are disjoint exactly when one begins after the other has ended.
    let a_begins_after_b = a_start > b_end;
    let b_begins_after_a = b_start > a_end;
    !(a_begins_after_b || b_begins_after_a)
}
445
446fn is_binary_string_copyright_candidate(text: &str) -> bool {
447 if has_explicit_copyright_marker(text) || contains_year(text) {
448 return true;
449 }
450
451 let lower = text.to_ascii_lowercase();
452 let Some(tail) = lower.strip_prefix("copyright") else {
453 return true;
454 };
455 let tail = tail.trim();
456 let alpha_tokens: Vec<&str> = tail
457 .split_whitespace()
458 .filter(|token| token.chars().any(|c| c.is_alphabetic()))
459 .collect();
460
461 if alpha_tokens.len() <= 1 {
462 return true;
463 }
464
465 if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
466 return true;
467 }
468
469 alpha_tokens
470 .iter()
471 .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
472}
473
/// Checks, ignoring ASCII case, whether `text` contains one of the markers `(c)`, `©` or
/// `copr`.
fn has_explicit_copyright_marker(text: &str) -> bool {
    let lowered = text.to_ascii_lowercase();
    ["(c)", "©", "copr"]
        .iter()
        .any(|marker| lowered.contains(*marker))
}
478
/// Reports whether `text` contains four consecutive ASCII digits that start with `19` or
/// `20`.
///
/// The first two digits are checked as a pair, so `10xx` and `29xx` do not count as years.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        window.iter().all(u8::is_ascii_digit)
            && matches!((window[0], window[1]), (b'1', b'9') | (b'2', b'0'))
    })
}
487
/// Checks whether `token` equals, ignoring ASCII case, one of a fixed list of words that
/// typically end an organisation name (for example "inc", "ltd" or "foundation").
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_SUFFIXES: [&str; 14] = [
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_SUFFIXES
        .iter()
        .any(|suffix| token.eq_ignore_ascii_case(suffix))
}
507
508fn extract_email_url_information(
509 file_info_builder: &mut FileInfoBuilder,
510 text_content: &str,
511 text_options: &TextDetectionOptions,
512) {
513 if !text_options.detect_emails && !text_options.detect_urls {
514 return;
515 }
516
517 if text_options.detect_emails {
518 let config = DetectionConfig {
519 max_emails: text_options.max_emails,
520 max_urls: text_options.max_urls,
521 unique: false,
522 };
523 let emails = finder::find_emails(text_content, &config)
524 .into_iter()
525 .map(|d| OutputEmail {
526 email: d.email,
527 start_line: d.start_line,
528 end_line: d.end_line,
529 })
530 .collect::<Vec<_>>();
531 file_info_builder.emails(emails);
532 }
533
534 if text_options.detect_urls {
535 let config = DetectionConfig {
536 max_emails: text_options.max_emails,
537 max_urls: text_options.max_urls,
538 unique: true,
539 };
540 let urls = finder::find_urls(text_content, &config)
541 .into_iter()
542 .map(|d| OutputURL {
543 url: d.url,
544 start_line: d.start_line,
545 end_line: d.end_line,
546 })
547 .collect::<Vec<_>>();
548 file_info_builder.urls(urls);
549 }
550}
551
552fn extract_license_information(
553 file_info_builder: &mut FileInfoBuilder,
554 scan_errors: &mut Vec<String>,
555 path: &Path,
556 text_content: String,
557 license_engine: Option<Arc<LicenseDetectionEngine>>,
558 license_options: LicenseScanOptions,
559 from_binary_strings: bool,
560) -> Result<(), Error> {
561 let Some(engine) = license_engine else {
562 return Ok(());
563 };
564
565 match engine.detect_with_kind_and_source(
566 &text_content,
567 license_options.unknown_licenses,
568 from_binary_strings,
569 &path.to_string_lossy(),
570 ) {
571 Ok(detections) => {
572 let query =
573 Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
574 let mut model_detections = Vec::new();
575 let mut model_clues = Vec::new();
576
577 for detection in &detections {
578 let (public_detection, clue_matches) = convert_detection_to_model(
579 detection,
580 license_options,
581 &text_content,
582 query.as_ref(),
583 );
584
585 if let Some(public_detection) = public_detection {
586 model_detections.push(public_detection);
587 }
588
589 model_clues.extend(clue_matches);
590 }
591
592 if !model_detections.is_empty() {
593 let expressions: Vec<String> = model_detections
594 .iter()
595 .filter(|d| !d.license_expression_spdx.is_empty())
596 .map(|d| d.license_expression_spdx.clone())
597 .collect();
598
599 if !expressions.is_empty() {
600 let combined = crate::utils::spdx::combine_license_expressions(expressions);
601 if let Some(expr) = combined {
602 file_info_builder.license_expression(Some(expr));
603 }
604 }
605 }
606
607 file_info_builder.license_detections(model_detections);
608 file_info_builder.license_clues(model_clues);
609 file_info_builder.percentage_of_license_text(
610 query
611 .as_ref()
612 .map(|query| compute_percentage_of_license_text(query, &detections)),
613 );
614 }
615 Err(e) => {
616 scan_errors.push(format!("License detection failed: {}", e));
617 }
618 }
619
620 Ok(())
621}
622
623fn convert_detection_to_model(
624 detection: &crate::license_detection::LicenseDetection,
625 license_options: LicenseScanOptions,
626 text_content: &str,
627 query: Option<&Query<'_>>,
628) -> (Option<LicenseDetection>, Vec<Match>) {
629 let matches: Vec<Match> = detection
630 .matches
631 .iter()
632 .map(|m| convert_match_to_model(m, license_options, text_content, query))
633 .collect();
634
635 if let Some(license_expression) = detection.license_expression.clone() {
636 (
637 Some(LicenseDetection {
638 license_expression,
639 license_expression_spdx: detection
640 .license_expression_spdx
641 .clone()
642 .unwrap_or_default(),
643 matches,
644 detection_log: if license_options.include_diagnostics {
645 detection.detection_log.clone()
646 } else {
647 Vec::new()
648 },
649 identifier: detection.identifier.clone(),
650 }),
651 Vec::new(),
652 )
653 } else {
654 (None, matches)
655 }
656}
657
658fn convert_match_to_model(
659 m: &crate::license_detection::models::LicenseMatch,
660 license_options: LicenseScanOptions,
661 text_content: &str,
662 query: Option<&Query<'_>>,
663) -> Match {
664 let rule_url = if m.rule_url.is_empty() {
665 None
666 } else {
667 Some(m.rule_url.clone())
668 };
669 let matched_text = if license_options.include_text {
670 m.matched_text.clone().or_else(|| {
671 Some(crate::license_detection::query::matched_text_from_text(
672 text_content,
673 m.start_line,
674 m.end_line,
675 ))
676 })
677 } else {
678 None
679 };
680 let matched_text_diagnostics = if license_options.include_text_diagnostics {
681 query.map(|query| matched_text_diagnostics_from_match(query, m))
682 } else {
683 None
684 };
685 Match {
686 license_expression: m.license_expression.clone(),
687 license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
688 from_file: m.from_file.clone(),
689 start_line: m.start_line,
690 end_line: m.end_line,
691 matcher: Some(m.matcher.to_string()),
692 score: m.score as f64,
693 matched_length: Some(m.matched_length),
694 match_coverage: Some(m.match_coverage as f64),
695 rule_relevance: Some(m.rule_relevance as usize),
696 rule_identifier: Some(m.rule_identifier.clone()),
697 rule_url,
698 matched_text,
699 referenced_filenames: m.referenced_filenames.clone(),
700 matched_text_diagnostics,
701 }
702}
703
704fn compute_percentage_of_license_text(
705 query: &Query<'_>,
706 detections: &[crate::license_detection::LicenseDetection],
707) -> f64 {
708 let matched_positions: std::collections::HashSet<usize> = detections
709 .iter()
710 .flat_map(|detection| detection.matches.iter())
711 .flat_map(InternalLicenseMatch::qspan)
712 .collect();
713
714 let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
715 if query_tokens_length == 0 {
716 return 0.0;
717 }
718
719 let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
720 (percentage * 100.0).round() / 100.0
721}
722
723fn matched_text_diagnostics_from_match(
724 query: &Query<'_>,
725 license_match: &InternalLicenseMatch,
726) -> String {
727 let matched_positions: std::collections::HashSet<usize> =
728 license_match.qspan().into_iter().collect();
729 let Some(start_pos) = matched_positions.iter().min().copied() else {
730 return crate::license_detection::query::matched_text_from_text(
731 &query.text,
732 license_match.start_line,
733 license_match.end_line,
734 );
735 };
736 let Some(end_pos) = matched_positions.iter().max().copied() else {
737 return crate::license_detection::query::matched_text_from_text(
738 &query.text,
739 license_match.start_line,
740 license_match.end_line,
741 );
742 };
743
744 crate::license_detection::query::matched_text_diagnostics_from_text(
745 &query.text,
746 query,
747 &matched_positions,
748 start_pos,
749 end_pos,
750 license_match.start_line,
751 license_match.end_line,
752 )
753}
754
/// Returns `true` when text-based detection should not run on this file.
///
/// The only skip condition is a buffer that `is_pem_certificate_file` recognises as a PEM
/// certificate.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    is_pem_certificate_file(path, buffer)
}
758
/// Reports whether `path` is a Go file that is test-only rather than production code.
///
/// A file qualifies when its extension is `go` and either
/// - its name ends in `_test.go`, or
/// - one of its first ten lines is a `//go:build` or `// +build` constraint that contains the
///   whitespace-separated token `test`.
///
/// Only the leading lines are read, so content further down the file (including bytes that
/// are not valid UTF-8) cannot affect the result.
///
/// # Errors
///
/// Returns the I/O error if the file cannot be opened or one of the inspected lines cannot be
/// read as UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    let named_as_test = path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"));
    if named_as_test {
        return Ok(true);
    }

    // Stream the file instead of loading it whole: only the first ten lines matter.
    let reader = std::io::BufReader::new(fs::File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(10) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
779
780fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
781 let prefix_len = buffer.len().min(8192);
782 let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
783 let trimmed_lines: Vec<&str> = prefix
784 .lines()
785 .map(str::trim)
786 .filter(|line| !line.is_empty())
787 .take(64)
788 .collect();
789
790 PEM_CERTIFICATE_HEADERS.iter().any(|(begin, end)| {
791 trimmed_lines.iter().any(|line| line == begin)
792 && trimmed_lines.iter().any(|line| line == end)
793 })
794}
795
796fn process_directory(
797 path: &Path,
798 metadata: &fs::Metadata,
799 collect_info: bool,
800 license_enabled: bool,
801) -> FileInfo {
802 let name = path
803 .file_name()
804 .unwrap_or_default()
805 .to_string_lossy()
806 .to_string();
807 let base_name = name.clone(); FileInfo {
810 name,
811 base_name,
812 extension: "".to_string(),
813 path: path.to_string_lossy().to_string(),
814 file_type: FileType::Directory,
815 mime_type: None,
816 size: 0,
817 date: get_creation_date(metadata),
818 sha1: None,
819 md5: None,
820 sha256: None,
821 programming_language: None,
822 package_data: Vec::new(), license_expression: None,
824 license_detections: Vec::new(), license_clues: Vec::new(), percentage_of_license_text: license_enabled.then_some(0.0),
827 copyrights: Vec::new(), holders: Vec::new(), authors: Vec::new(), emails: Vec::new(), urls: Vec::new(), for_packages: Vec::new(),
833 scan_errors: Vec::new(),
834 is_source: collect_info.then_some(false),
835 source_count: None,
836 is_legal: false,
837 is_manifest: false,
838 is_readme: false,
839 is_top_level: false,
840 is_key_file: false,
841 is_community: false,
842 is_generated: None,
843 facets: vec![],
844 tallies: None,
845 }
846}
847
// Unit tests for the detection-to-model conversion, the license-text percentage computation,
// and the Go test-source classification.
#[cfg(test)]
mod tests {
    use super::{
        compute_percentage_of_license_text, convert_detection_to_model, is_go_non_production_source,
    };
    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::TokenDictionary;
    use crate::license_detection::models::{LicenseMatch, MatcherKind, RuleKind};
    use crate::license_detection::query::Query;
    use crate::scanner::LicenseScanOptions;
    use std::fs;
    use tempfile::tempdir;

    // Builds a single-line MIT match whose only variable part is the rule URL.
    fn make_internal_match(rule_url: &str) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: "mit".to_string(),
            license_expression_spdx: Some("MIT".to_string()),
            from_file: None,
            start_line: 1,
            end_line: 1,
            start_token: 0,
            end_token: 1,
            matcher: MatcherKind::Hash,
            score: 1.0,
            matched_length: 3,
            rule_length: 3,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "mit.LICENSE".to_string(),
            rule_url: rule_url.to_string(),
            matched_text: Some("MIT".to_string()),
            referenced_filenames: None,
            rule_kind: RuleKind::Text,
            is_from_license: true,
            matched_token_positions: None,
            hilen: 3,
            rule_start_token: 0,
            qspan_positions: None,
            ispan_positions: None,
            hispan_positions: None,
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    // Wraps one MIT match in a detection that carries a license expression.
    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
        InternalLicenseDetection {
            license_expression: Some("mit".to_string()),
            license_expression_spdx: Some("MIT".to_string()),
            matches: vec![make_internal_match(rule_url)],
            detection_log: vec![],
            identifier: Some("mit-test".to_string()),
            file_regions: Vec::new(),
        }
    }

    // Builds an index from the given dictionary entries and sets `len_legalese` explicitly.
    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
        let dictionary = TokenDictionary::new_with_legalese(entries);
        let mut index = LicenseIndex::new(dictionary);
        index.len_legalese = len_legalese;
        index
    }

    // A non-empty rule URL must survive the conversion unchanged.
    #[test]
    fn test_convert_detection_to_model_preserves_rule_url() {
        let detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
        );

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(
            converted.matches[0].rule_url.as_deref(),
            Some(
                "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
            )
        );
        assert!(clues.is_empty());
    }

    // An empty rule URL is converted to `None` rather than an empty string.
    #[test]
    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
        let detection = make_detection("");

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(converted.matches[0].rule_url, None);
        assert!(clues.is_empty());
    }

    // A detection without a license expression yields no detection; its matches become clues.
    #[test]
    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
        );
        detection.license_expression = None;
        detection.license_expression_spdx = None;
        detection.identifier = None;
        detection.matches[0].license_expression = "unknown-license-reference".to_string();
        detection.matches[0].license_expression_spdx =
            Some("LicenseRef-scancode-unknown-license-reference".to_string());
        detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
        detection.matches[0].rule_kind = RuleKind::Clue;

        let (converted, clues) = convert_detection_to_model(
            &detection,
            LicenseScanOptions {
                include_text: true,
                ..LicenseScanOptions::default()
            },
            "clue text",
            None,
        );

        assert!(converted.is_none());
        assert_eq!(clues.len(), 1);
        assert_eq!(clues[0].license_expression, "unknown-license-reference");
        assert_eq!(
            clues[0].license_expression_spdx,
            "LicenseRef-scancode-unknown-license-reference"
        );
        assert_eq!(
            clues[0].rule_identifier.as_deref(),
            Some("license-clue_1.RULE")
        );
        // The match's own text ("MIT") takes precedence over the text passed in.
        assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
        assert_eq!(clues[0].matched_text_diagnostics, None);
    }

    // With all diagnostics options enabled, the detection log, the matched text and the
    // matched-text diagnostics are all populated.
    #[test]
    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
        let text = concat!(
            "Reproduction and distribution of this file, with or without modification, are\n",
            "permitted in any medium without royalties provided the copyright notice\n",
            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
        );
        let index = create_test_index(
            &[
                ("reproduction", 0),
                ("distribution", 1),
                ("file", 2),
                ("without", 3),
                ("modification", 4),
                ("permitted", 5),
                ("medium", 6),
                ("royalties", 7),
                ("provided", 8),
                ("copyright", 9),
                ("notice", 10),
                ("preserved", 11),
                ("offered", 12),
                ("warranties", 13),
            ],
            14,
        );
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
        );
        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
        detection.matches[0].license_expression = "fsf-ap".to_string();
        detection.matches[0].license_expression_spdx = Some("FSFAP".to_string());
        detection.matches[0].rule_identifier = "fsf-ap.LICENSE".to_string();
        // Clear the stored text so the matched text is taken from the scanned lines.
        detection.matches[0].matched_text = None;
        detection.matches[0].start_line = 1;
        detection.matches[0].end_line = 3;
        detection.matches[0].start_token = 0;
        detection.matches[0].end_token = query.tokens.len();
        // Leave query position 9 unmatched so the diagnostics differ from the plain text.
        detection.matches[0].qspan_positions = Some(
            query
                .tokens
                .iter()
                .enumerate()
                .filter_map(|(idx, _)| (idx != 9).then_some(idx))
                .collect(),
        );
        detection.identifier = Some("fsf_ap-test".to_string());

        let (converted, clues) = convert_detection_to_model(
            &detection,
            LicenseScanOptions {
                include_text: true,
                include_text_diagnostics: true,
                include_diagnostics: true,
                unknown_licenses: false,
            },
            text,
            Some(&query),
        );
        let converted = converted.expect("detection should convert");

        assert!(clues.is_empty());
        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
        assert_eq!(
            converted.matches[0].matched_text.as_deref(),
            Some(text.trim_end())
        );
        let diagnostics = converted.matches[0]
            .matched_text_diagnostics
            .as_deref()
            .expect("diagnostics should be present");
        assert!(diagnostics.contains('['));
        assert!(diagnostics.contains(']'));
        assert_ne!(diagnostics, text.trim_end());
    }

    // One matched position out of three counted tokens gives 33.33 percent.
    #[test]
    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
        let text = "alpha MIT omega";
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
        let mut detection = make_detection("");
        detection.matches[0].qspan_positions = Some(vec![1]);
        detection.matches[0].start_token = 1;
        detection.matches[0].end_token = 2;

        let percentage = compute_percentage_of_license_text(&query, &[detection]);

        assert_eq!(percentage, 33.33);
    }

    // A `_test.go` file name alone marks the file as non-production.
    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join("scanner_test.go");
        fs::write(&path, "package scanner\n").unwrap();

        assert!(is_go_non_production_source(&path).unwrap());
    }

    // A `//go:build test` constraint marks the file as non-production.
    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join("scanner.go");
        fs::write(&path, "//go:build test\n\npackage scanner\n").unwrap();

        assert!(is_go_non_production_source(&path).unwrap());
    }

    // A plain Go file without test markers is production code.
    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join("scanner.go");
        fs::write(&path, "package scanner\n").unwrap();

        assert!(!is_go_non_production_source(&path).unwrap());
    }
}