use std::sync::LazyLock;
use regex::Regex;
use super::hints;
use super::prepare::prepare_text_line;
/// Substrings that are counted (case-sensitively) by `is_code_line_with_false_c`.
///
/// The entries are operators, keywords and DOM method names (for example
/// `addEventListener`, `.prototype`); a long line containing two or more of
/// them is treated as source code rather than a copyright statement.
static CODE_PATTERNS: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
vec![
"function(",
"this.",
".prototype",
"===",
"!==",
"var ",
"return ",
"typeof ",
"undefined",
"null)",
"null,",
".apply(",
".call(",
"addEventListener",
"removeEventListener",
"createElement",
"appendChild",
"innerHTML",
"className",
"setAttribute",
"getAttribute",
]
});
/// Lines longer than this many bytes are skipped by `collect_candidate_lines`
/// unless `has_copyright_indicators` matches them.
const MAX_LINE_LENGTH: usize = 2_000;
/// Lines shorter than this many bytes are never classified by
/// `is_code_line_with_false_c`.
const CODE_LINE_MIN_LENGTH: usize = 200;
/// Lines shorter than this many bytes are never classified as encoded data.
const ENCODED_LINE_MIN_LENGTH: usize = 40;
/// Minimum fraction of bytes in the range 32..=96 for a line to pass the
/// uuencode check.
const ENCODED_CHAR_RATIO: f64 = 0.90;
/// Cheap byte-level pre-filter for lines that may carry a copyright notice.
///
/// Returns `true` when the line contains (ASCII case-insensitively) one of the
/// fragments `opyr`, `opyl` or `auth`, or a `(c)` sign followed by a digit.
fn has_copyright_indicators(line: &str) -> bool {
    const WORD_FRAGMENTS: [&[u8]; 3] = [b"opyr", b"opyl", b"auth"];

    let raw = line.as_bytes();
    if WORD_FRAGMENTS
        .iter()
        .any(|&fragment| contains_ascii_ci(raw, fragment))
    {
        return true;
    }
    has_c_sign_before_year(raw)
}
/// Reports whether `needle` occurs in `haystack`, ignoring ASCII case.
///
/// An empty `needle` is contained in every haystack (mirroring
/// `str::contains("")`). It is handled up front because `slice::windows`
/// panics when asked for zero-sized windows.
fn contains_ascii_ci(haystack: &[u8], needle: &[u8]) -> bool {
    if needle.is_empty() {
        return true;
    }
    // `windows` yields nothing when the needle is longer than the haystack,
    // so no separate length guard is required.
    haystack
        .windows(needle.len())
        .any(|window| window.eq_ignore_ascii_case(needle))
}
/// Reports whether some `(c)` / `(C)` occurrence is followed by a digit,
/// optionally separated from it by spaces or tabs.
fn has_c_sign_before_year(bytes: &[u8]) -> bool {
    bytes.windows(3).enumerate().any(|(start, sign)| {
        matches!(sign, [b'(', b'c' | b'C', b')'])
            // The first byte after the sign that is not blank decides the match.
            && bytes[start + 3..]
                .iter()
                .find(|&&b| b != b' ' && b != b'\t')
                .is_some_and(|b| b.is_ascii_digit())
    })
}
/// Reports whether `line` looks like a line of uuencoded or base64 payload.
///
/// Lines shorter than `ENCODED_LINE_MIN_LENGTH` bytes, and lines for which
/// `has_copyright_indicators` matches, are never reported as encoded data.
fn is_encoded_data_line(line: &str) -> bool {
    let too_short = line.len() < ENCODED_LINE_MIN_LENGTH;
    if too_short || has_copyright_indicators(line) {
        return false;
    }
    let raw = line.as_bytes();
    is_uuencode_data_line(raw) || is_base64_data_line(raw)
}
/// Heuristic check for a uuencoded data line.
///
/// A line qualifies when all of the following hold:
/// * it is non-empty and its first byte is in `32..=95`;
/// * at least `ENCODED_CHAR_RATIO` of its bytes are in `32..=96`;
/// * it uses at least 8 distinct byte values (rules out decorator lines made
///   of one repeated character);
/// * it contains at most one space (rules out ordinary prose).
///
/// An empty slice returns `false` instead of panicking.
fn is_uuencode_data_line(bytes: &[u8]) -> bool {
    match bytes.first() {
        Some(first) if (32..=95).contains(first) => {}
        _ => return false,
    }

    let uu_count = bytes.iter().filter(|&&b| (32..=96).contains(&b)).count();
    let ratio = uu_count as f64 / bytes.len() as f64;
    if ratio < ENCODED_CHAR_RATIO {
        return false;
    }

    // Count distinct byte values with a fixed-size presence table.
    let mut seen = [false; 256];
    for &b in bytes {
        seen[usize::from(b)] = true;
    }
    if seen.iter().filter(|&&present| present).count() < 8 {
        return false;
    }

    bytes.iter().filter(|&&b| b == b' ').count() <= 1
}
/// Reports whether every byte belongs to the base64 alphabet
/// (`A-Z`, `a-z`, `0-9`, `+`, `/`, `=`).
///
/// Any other byte, including a space, disqualifies the line. An empty slice
/// is vacuously accepted.
fn is_base64_data_line(bytes: &[u8]) -> bool {
    for &byte in bytes {
        let in_alphabet = matches!(
            byte,
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'='
        );
        if !in_alphabet {
            return false;
        }
    }
    true
}
/// Reports whether a long line is source code that merely happens to look
/// like a copyright candidate (for example a `(c)` in minified code).
///
/// Only lines of at least `CODE_LINE_MIN_LENGTH` bytes that contain none of
/// the fragments `opyr`, `opyl`, `auth` (after lowercasing) are considered;
/// such a line is code when it contains at least two `CODE_PATTERNS` entries.
fn is_code_line_with_false_c(line: &str) -> bool {
    if line.len() < CODE_LINE_MIN_LENGTH {
        return false;
    }

    let lowered = line.to_lowercase();
    let mentions_copyright = ["opyr", "opyl", "auth"]
        .iter()
        .any(|&fragment| lowered.contains(fragment));
    if mentions_copyright {
        return false;
    }

    // Stop as soon as the second distinct pattern is found.
    let mut hits = 0usize;
    for &pattern in CODE_PATTERNS.iter() {
        if line.contains(pattern) {
            hits += 1;
            if hits >= 2 {
                return true;
            }
        }
    }
    false
}
/// Reports whether `line` is a declaration using the `@convention(c)`
/// attribute rather than a copyright notice.
///
/// The line (ignoring leading whitespace) must contain `@convention(c)`, must
/// not match `has_copyright_indicators`, and must either start with `let `,
/// `var ` or `typealias `, or contain ` -> `.
fn is_swift_convention_c_signature_line(line: &str) -> bool {
    let body = line.trim_start();
    if !body.contains("@convention(c)") || has_copyright_indicators(body) {
        return false;
    }
    let declaration_like = ["let ", "var ", "typealias "]
        .iter()
        .any(|&keyword| body.starts_with(keyword));
    declaration_like || body.contains(" -> ")
}
/// Matches any single character that is not an ASCII letter or digit.
static NON_CHARS_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[^a-zA-Z0-9]").unwrap());
/// Suffixes (in `chars_only` form) checked by `ends_with_continuation`.
const CONTINUATION_SUFFIXES: &[&str] = &["copyright", "copyrights", "and", "by"];
/// Suffixes (in `chars_only` form) checked by `is_end_of_statement`.
const END_SUFFIXES: &[&str] = &["rightreserved", "rightsreserved"];
/// Lowercases `s` and removes every character matched by `NON_CHARS_RE`,
/// leaving only lowercase ASCII letters and digits.
fn chars_only(s: &str) -> String {
    let lowered = s.to_lowercase();
    let stripped = NON_CHARS_RE.replace_all(&lowered, "");
    stripped.into_owned()
}
/// Reports whether `chars` (expected in `chars_only` form) ends with one of
/// the `END_SUFFIXES`.
fn is_end_of_statement(chars: &str) -> bool {
    for &suffix in END_SUFFIXES {
        if chars.ends_with(suffix) {
            return true;
        }
    }
    false
}
/// Extracts the holder from a one-line banner shaped like
/// `project vX.Y (c) Holder Name <license reference>`.
///
/// Returns `None` unless all of the following hold:
/// * the text (after trimming and dropping leading `!`) contains a lowercase
///   `(c)`;
/// * the part before `(c)` has a word containing a letter and a word that
///   looks like a version (optional `v`, leading digit, and a `.` or `-`);
/// * the part after `(c)` mentions a license, and a license word/URL token can
///   be located in it;
/// * the words between `(c)` and that license token, with surrounding
///   `|`, `;`, `,` and whitespace removed, form at least two words.
pub(crate) fn versioned_banner_holder_from_prepared(prepared: &str) -> Option<String> {
    const C_SIGN: &str = "(c)";
    const LICENSE_HINTS: [&str; 5] = [
        ".org/license",
        ".com/license",
        " licensed",
        " license",
        " released under",
    ];

    /// True for words such as `v2.2.0`, `3.7.1,` or `1-beta`.
    fn is_version_word(word: &str) -> bool {
        let core =
            word.trim_matches(|c: char| !(c.is_ascii_alphanumeric() || c == '.' || c == '_'));
        let number = core.strip_prefix('v').unwrap_or(core);
        number.starts_with(|c: char| c.is_ascii_digit())
            && number.contains(|c: char| c == '.' || c == '-')
    }

    let banner = prepared.trim().trim_start_matches('!').trim_start();
    let (before_sign, after_sign) = banner.split_once(C_SIGN)?;
    let after_sign = after_sign.trim_start();

    let names_version = before_sign.split_whitespace().any(is_version_word);
    let names_project = before_sign
        .split_whitespace()
        .any(|word| word.contains(|c: char| c.is_ascii_alphabetic()));
    if !names_version || !names_project {
        return None;
    }

    let after_sign_lower = after_sign.to_ascii_lowercase();
    if !LICENSE_HINTS
        .iter()
        .any(|&hint| after_sign_lower.contains(hint))
    {
        return None;
    }

    // The holder is everything before the first token that refers to a license.
    let words: Vec<&str> = after_sign.split_whitespace().collect();
    let license_at = (0..words.len()).find(|&idx| {
        let word = words[idx].to_ascii_lowercase();
        word.contains("/license")
            || matches!(word.as_str(), "license" | "licenses" | "licensed")
            || (word == "released"
                && words
                    .get(idx + 1)
                    .is_some_and(|next| next.eq_ignore_ascii_case("under")))
    })?;

    let joined = words[..license_at].join(" ");
    let holder = joined.trim_matches(|c: char| matches!(c, '|' | ';' | ',') || c.is_whitespace());
    if holder.split_whitespace().count() >= 2 {
        Some(holder.to_string())
    } else {
        None
    }
}
/// Reports whether a raw source line opens with `/*!` (ignoring leading
/// whitespace) and, once prepared, yields a holder via
/// `versioned_banner_holder_from_prepared`.
pub(crate) fn is_raw_versioned_project_banner_line(line: &str) -> bool {
    if !line.trim_start().starts_with("/*!") {
        return false;
    }
    let prepared = prepare_text_line(line);
    versioned_banner_holder_from_prepared(&prepared).is_some()
}
/// Reports whether `chars` (expected in `chars_only` form) ends in a way that
/// suggests the statement continues on a later line: one of the
/// `CONTINUATION_SUFFIXES`, or whatever `hints::has_trailing_year` accepts.
///
/// Empty input never counts as a continuation.
fn ends_with_continuation(chars: &str) -> bool {
    if chars.is_empty() {
        return false;
    }
    let ends_with_keyword = CONTINUATION_SUFFIXES
        .iter()
        .any(|&suffix| chars.ends_with(suffix));
    ends_with_keyword || hints::has_trailing_year(chars)
}
/// Reports whether `line` looks like a row of tabular data: it contains a tab,
/// a `--`, and a `/` placed directly between two ASCII digits.
fn is_tabular_noise_line(line: &str) -> bool {
    if !(line.contains('\t') && line.contains("--")) {
        return false;
    }
    line.as_bytes()
        .windows(3)
        .any(|w| w[0].is_ascii_digit() && w[1] == b'/' && w[2].is_ascii_digit())
}
/// Reports whether a prepared line is an `@directive` unrelated to copyright.
///
/// The line (ignoring leading whitespace) must start with `@` followed by an
/// ASCII lowercase letter, and must have no year (per `hints::has_year`), no
/// `://`, and no match from `has_copyright_indicators`.
fn is_noncopyright_at_directive_line(prepared: &str) -> bool {
    let text = prepared.trim_start();
    let is_lowercase_directive = match text.strip_prefix('@') {
        Some(rest) => rest.chars().next().is_some_and(|c| c.is_ascii_lowercase()),
        None => false,
    };
    if !is_lowercase_directive {
        return false;
    }
    !(hints::has_year(text) || text.contains("://") || has_copyright_indicators(text))
}
/// A line number paired with the text stored for that line.
pub type NumberedLine = (usize, String);
/// Groups input lines into runs that may contain a copyright statement.
///
/// Takes `(line_number, text)` pairs and returns a list of groups; each group
/// is a list of `(line_number, prepared_text)` pairs, where the text has been
/// passed through `prepare_text_line` (one path additionally prefixes it with
/// `"Copyright "`).
///
/// Each line is handled by the first matching rule:
/// 1. author-header lines close the current group and are dropped;
/// 2. over-long lines without copyright indicators, and encoded-data lines,
///    are dropped and consume one unit of the continuation budget;
/// 3. lines ending in "right(s) reserved" are appended and close the group;
/// 4. candidate lines (per `hints::is_candidate`, containing "http", or a
///    versioned `/*!` banner) are appended and reset the budget to 2;
/// 5. while the budget is positive, other lines are appended as continuations
///    unless they look like directives, author headers, code, standalone
///    comments, tabular noise, or a blank line after a completed statement;
/// 6. any other line closes the current group.
pub fn collect_candidate_lines<T>(
numbered_lines: impl IntoIterator<Item = (usize, T)>,
) -> Vec<Vec<NumberedLine>>
where
T: AsRef<str>,
{
// Finished groups, and the group currently being assembled.
let mut groups: Vec<Vec<NumberedLine>> = Vec::new();
let mut candidates: Vec<NumberedLine> = Vec::new();
// Number of further non-candidate lines that may still join the group.
let mut in_copyright: u32 = 0;
// `chars_only` form of the last candidate line, used for the blank-line rule.
let mut previous_chars: Option<String> = None;
// Set once a "format-specification:" or "files:" line has been seen.
let mut debian_like: bool = false;
// Set after a bare "Copyright:" header in a debian-like file; the next
// "(c) <year>" line then gets a "Copyright " prefix.
let mut debian_header_only_copyright_next_copy_needs_prefix: bool = false;
// Whether the previously stored line satisfied `is_copy_marker_start`.
let mut prev_prepared_is_copy_start_with_year = false;
for (ln, line) in numbered_lines {
let line = line.as_ref();
// Lowercased line with leading whitespace and comment markers removed.
let lower_trim = line.trim_start();
let lower_trim = lower_trim
.trim_start_matches(['*', '/', '#', ';', '!'])
.trim_start()
.to_ascii_lowercase();
// "Authors:"-style headers without a year or copyright marker.
let is_author_header_line = (lower_trim == "authors"
|| lower_trim.starts_with("authors:")
|| lower_trim.starts_with("author(s):")
|| lower_trim.starts_with("author:")
|| lower_trim.starts_with("author(s)"))
&& !hints::has_year(line)
&& !lower_trim.contains("copyright")
&& !lower_trim.contains("(c)")
&& !lower_trim.contains("copr");
if is_author_header_line {
// Close the current group and drop the header line itself.
if !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
continue;
}
if lower_trim.starts_with("format-specification:") || lower_trim.starts_with("files:") {
debian_like = true;
}
// A new "Files:" or "License:" field cancels a pending header-only prefix.
if lower_trim.starts_with("files:") || lower_trim.starts_with("license:") {
debian_header_only_copyright_next_copy_needs_prefix = false;
}
// Over-long lines without indicators are skipped but use up budget.
// NOTE(review): when the budget reaches 0 here (and in the encoded-data
// case below) `prev_prepared_is_copy_start_with_year` is not reset, unlike
// the other group-closing paths — confirm this is intended.
if line.len() > MAX_LINE_LENGTH && !has_copyright_indicators(line) {
if in_copyright > 0 {
in_copyright -= 1;
if in_copyright == 0 && !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
previous_chars = None;
}
}
continue;
}
// Encoded payload lines are handled the same way as over-long lines.
if is_encoded_data_line(line) {
if in_copyright > 0 {
in_copyright -= 1;
if in_copyright == 0 && !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
previous_chars = None;
}
}
continue;
}
let co = chars_only(line);
if is_end_of_statement(&co) {
// "... right(s) reserved": append the line and close the group.
let prepared = prepare_text_line(line);
let prepared_is_copy_start_with_year = is_copy_marker_start(prepared.trim_start());
// Two consecutive copyright-start lines start separate groups unless
// one of them carries only years.
if prepared_is_copy_start_with_year && prev_prepared_is_copy_start_with_year {
let prev_is_year_only = candidates
.last()
.is_some_and(|(_, prev)| is_year_only_copy_marker_line(prev.as_str()));
let current_is_year_only = is_year_only_copy_marker_line(prepared.as_str());
if !prev_is_year_only && !current_is_year_only && !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
}
// In debian-like files every copyright-start line gets its own group.
if debian_like
&& is_copy_marker_start(prepared.trim_start())
&& candidates
.last()
.is_some_and(|(_, prev)| is_copy_marker_start(prev.trim_start()))
{
groups.push(std::mem::take(&mut candidates));
}
candidates.push((ln, prepared));
groups.push(std::mem::take(&mut candidates));
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = prepared_is_copy_start_with_year;
} else if hints::is_candidate(line)
|| co.contains("http")
|| is_raw_versioned_project_banner_line(line)
{
// Candidate line. First discard known false positives.
if is_swift_convention_c_signature_line(line) {
continue;
}
if is_code_line_with_false_c(line) {
continue;
}
let prepared = prepare_text_line(line);
if is_noncopyright_at_directive_line(&prepared) {
if !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
continue;
}
let prepared_chars = chars_only(&prepared);
let prepared_is_copy_start_with_year = is_copy_marker_start(prepared.trim_start());
// Same splitting rule as in the end-of-statement branch above.
if prepared_is_copy_start_with_year && prev_prepared_is_copy_start_with_year {
let prev_is_year_only = candidates
.last()
.is_some_and(|(_, prev)| is_year_only_copy_marker_line(prev.as_str()));
let current_is_year_only = is_year_only_copy_marker_line(prepared.as_str());
if !prev_is_year_only && !current_is_year_only && !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
}
// A bare "Copyright:" header with nothing after the colon: remember that
// the next "(c) <year>" line needs a "Copyright " prefix.
if debian_like
&& (prepared_chars == "copyright" || prepared_chars == "copyrights")
&& prepared
.split_once(':')
.map(|(_, tail)| tail.trim())
.is_some_and(|tail| tail.is_empty())
{
debian_header_only_copyright_next_copy_needs_prefix = true;
}
if debian_like
&& is_copy_marker_start(prepared.trim_start())
&& candidates
.last()
.is_some_and(|(_, prev)| is_copy_marker_start(prev.trim_start()))
{
groups.push(std::mem::take(&mut candidates));
}
// A one-line `/* ... */` versioned banner forms a complete group.
if is_standalone_comment_line(line)
&& versioned_banner_holder_from_prepared(&prepared).is_some()
{
candidates.push((ln, prepared));
groups.push(std::mem::take(&mut candidates));
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = prepared_is_copy_start_with_year;
continue;
}
// "Copyright:" followed only by punctuation: not stored, but it opens a
// continuation window for the following lines.
if (prepared_chars == "copyright" || prepared_chars == "copyrights")
&& prepared
.split_once(':')
.map(|(_, tail)| tail.trim())
.is_some_and(|tail| {
!tail.is_empty() && tail.chars().all(|c| !c.is_ascii_alphanumeric())
})
{
in_copyright = 2;
previous_chars = Some(prepared_chars);
continue;
}
in_copyright = 2;
if debian_header_only_copyright_next_copy_needs_prefix {
let trimmed = prepared.trim_start();
let lower = trimmed.to_ascii_lowercase();
let is_copy_start = starts_with_c_sign(trimmed) || lower.starts_with("copyright");
if is_copy_start && hints::has_year(trimmed) {
debian_header_only_copyright_next_copy_needs_prefix = false;
if starts_with_c_sign(trimmed) {
// NOTE(review): this path stores the prefixed line without updating
// `prev_prepared_is_copy_start_with_year` — confirm this is intended.
let prefixed = format!("Copyright {trimmed}");
let prefixed_chars = chars_only(&prefixed);
candidates.push((ln, prefixed));
previous_chars = Some(prefixed_chars);
continue;
}
}
}
candidates.push((ln, prepared));
previous_chars = Some(prepared_chars);
prev_prepared_is_copy_start_with_year = prepared_is_copy_start_with_year;
} else if in_copyright > 0 {
// Non-candidate line inside the continuation window.
let prepared = prepare_text_line(line);
let trimmed = prepared.trim_start();
let lower = trimmed.to_ascii_lowercase();
if is_noncopyright_at_directive_line(trimmed) {
if !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
continue;
}
// Same author-header test as at the top of the loop, applied to the
// prepared text.
let is_author_header = (lower == "authors"
|| lower.starts_with("authors:")
|| lower.starts_with("author(s):")
|| lower.starts_with("author:")
|| lower.starts_with("author(s)"))
&& !hints::has_year(trimmed)
&& !lower.contains("copyright")
&& !lower.contains("(c)")
&& !lower.contains("copr");
if is_author_header {
if !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
continue;
}
// Source code ends the statement.
if is_obvious_code_line(line) && !has_copyright_indicators(line) {
if !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
continue;
}
// An indented one-line `/* ... */` comment without indicators also ends it.
let is_standalone_comment_without_indicators =
is_standalone_comment_line(line) && !has_copyright_indicators(&prepared);
let is_indented_standalone_comment = line.trim_start().starts_with("/*")
&& line.trim_end().ends_with("*/")
&& !line.starts_with("/*");
if is_standalone_comment_without_indicators && is_indented_standalone_comment {
if !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
continue;
}
if co.is_empty() {
// Line without letters or digits: keep going only if the previous
// candidate ended in a continuation word or a year.
if let Some(ref prev) = previous_chars {
if !ends_with_continuation(prev) {
if !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
} else {
// NOTE(review): `prepare_text_line(line)` was already computed as
// `prepared` above; it is evaluated again here.
candidates.push((ln, prepare_text_line(line)));
in_copyright -= 1;
}
} else {
if !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
}
} else if is_tabular_noise_line(line) {
if !candidates.is_empty() {
groups.push(std::mem::take(&mut candidates));
}
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
} else {
// Ordinary continuation line: store it and spend one unit of budget.
candidates.push((ln, prepare_text_line(line)));
in_copyright -= 1;
}
} else if !candidates.is_empty() {
// Unrelated line outside the continuation window closes the group.
groups.push(std::mem::take(&mut candidates));
in_copyright = 0;
previous_chars = None;
prev_prepared_is_copy_start_with_year = false;
}
}
// Flush whatever is still open at end of input.
if !candidates.is_empty() {
groups.push(candidates);
}
groups
}
/// Reports whether `s` begins a dated copyright statement.
///
/// After skipping leading whitespace and an optional whole-word `portions`
/// (any ASCII case) plus the punctuation/whitespace following it, the text
/// must start with a `(c)` sign or the word `copyright` (any ASCII case) and
/// must contain a year according to `hints::has_year`.
fn is_copy_marker_start(s: &str) -> bool {
    const PORTIONS: &str = "portions";
    const COPYRIGHT: &str = "copyright";

    fn has_prefix_ci(text: &str, word: &str) -> bool {
        text.get(..word.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(word))
    }

    let mut rest = s.trim_start();

    // "portions" only counts when it is not the start of a longer word.
    let portions_ends_here = match rest.as_bytes().get(PORTIONS.len()) {
        None => true,
        Some(next) => next.is_ascii_whitespace() || next.is_ascii_punctuation(),
    };
    if has_prefix_ci(rest, PORTIONS) && portions_ends_here {
        rest = rest
            .get(PORTIONS.len()..)
            .unwrap_or("")
            .trim_start_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace());
    }

    let starts_with_marker = starts_with_c_sign(rest) || has_prefix_ci(rest, COPYRIGHT);
    starts_with_marker && hints::has_year(rest)
}
/// Reports whether `s`, ignoring leading whitespace, starts with `(c)` or `(C)`.
fn starts_with_c_sign(s: &str) -> bool {
    matches!(
        s.trim_start().as_bytes(),
        [b'(', b'c' | b'C', b')', ..]
    )
}
/// Reports whether `line` is clearly source code rather than prose.
///
/// A line qualifies when, ignoring leading whitespace, it
/// * is exactly `public`, `private` or `protected` (any ASCII case);
/// * starts with a declaration keyword followed by a space, or with
///   `#define` / `#include` (any ASCII case); or
/// * consists solely of a single `{`, `}` or `;` (trailing whitespace ignored).
///
/// Blank lines are never code.
fn is_obvious_code_line(line: &str) -> bool {
    const BARE_KEYWORDS: [&str; 3] = ["public", "private", "protected"];
    const CODE_PREFIXES: [&str; 16] = [
        "public ",
        "private ",
        "protected ",
        "static ",
        "const ",
        "class ",
        "struct ",
        "enum ",
        "interface ",
        "impl ",
        "fn ",
        "let ",
        "var ",
        "function ",
        "#define",
        "#include",
    ];

    let t = line.trim_start();
    if t.is_empty() {
        return false;
    }

    let lower = t.to_ascii_lowercase();
    if BARE_KEYWORDS.contains(&lower.as_str())
        || CODE_PREFIXES.iter().any(|&prefix| lower.starts_with(prefix))
    {
        return true;
    }

    // Only whitespace is trimmed here: `{`, `}` and `;` are themselves ASCII
    // punctuation, so trimming punctuation would make this comparison
    // impossible to satisfy.
    matches!(t.trim_end(), "{" | "}" | ";")
}
/// Reports whether `line`, ignoring surrounding whitespace, is a complete
/// one-line block comment: it starts with `/*` and ends with `*/`.
fn is_standalone_comment_line(line: &str) -> bool {
    // The delimiters contain no letters, so no case folding (and no
    // allocation) is needed; an empty line fails both checks.
    let t = line.trim();
    t.starts_with("/*") && t.ends_with("*/")
}
fn is_year_only_copy_marker_line(prepared: &str) -> bool {
let mut s = prepared.trim_start();
if s.get(.."copyright".len())
.is_some_and(|prefix| prefix.eq_ignore_ascii_case("copyright"))
{
s = s.get("copyright".len()..).unwrap_or("");
}
s = s.trim_start();
if starts_with_c_sign(s) {
s = s.get(3..).unwrap_or("");
} else {
s = s.trim_start_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace());
if starts_with_c_sign(s) {
s = s.get(3..).unwrap_or("");
}
}
s = s.trim_start_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace());
let s = s.trim();
if s.is_empty() {
return true;
}
s.chars().all(|c| {
c.is_ascii_digit()
|| c.is_whitespace()
|| matches!(c, ',' | '-' | '+' | '(' | ')' | '.' | ';' | ':')
})
}
/// Removes one pair of parentheses enclosing the whole string.
///
/// The input is returned unchanged unless it starts with `(` and ends with
/// `)`, and the text in between contains no further parenthesis, no `@`, and
/// none of `http://`, `https://`, `ftp://`, `www.` (ASCII case-insensitive).
pub fn strip_balanced_edge_parens(s: &str) -> &str {
    const LINK_MARKERS: [&str; 4] = ["http://", "https://", "ftp://", "www."];

    let inner = match s.strip_prefix('(').and_then(|rest| rest.strip_suffix(')')) {
        Some(inner) => inner,
        None => return s,
    };
    if inner.contains(|c: char| matches!(c, '(' | ')' | '@')) {
        return s;
    }

    // Lowercase once instead of once per marker.
    let lowered = inner.to_ascii_lowercase();
    if LINK_MARKERS.iter().any(|&marker| lowered.contains(marker)) {
        return s;
    }
    inner
}
// Unit tests for the private helpers and for `collect_candidate_lines`.
#[cfg(test)]
mod tests {
use super::*;
// --- chars_only ---
#[test]
fn test_chars_only_basic() {
assert_eq!(chars_only("Hello, World! 123"), "helloworld123");
}
#[test]
fn test_chars_only_empty() {
assert_eq!(chars_only(""), "");
}
#[test]
fn test_chars_only_only_punct() {
assert_eq!(chars_only("---...!!!"), "");
}
// --- is_end_of_statement ---
#[test]
fn test_eos_rights_reserved() {
assert!(is_end_of_statement("allrightsreserved"));
}
#[test]
fn test_eos_right_reserved() {
assert!(is_end_of_statement("allrightreserved"));
}
#[test]
fn test_eos_negative() {
assert!(!is_end_of_statement("copyright2024"));
}
// --- ends_with_continuation ---
#[test]
fn test_continuation_copyright() {
assert!(ends_with_continuation("somecopyright"));
}
#[test]
fn test_continuation_and() {
assert!(ends_with_continuation("fooand"));
}
#[test]
fn test_continuation_by() {
assert!(ends_with_continuation("writtenby"));
}
#[test]
fn test_continuation_year() {
assert!(ends_with_continuation("text2024"));
}
#[test]
fn test_continuation_negative() {
assert!(!ends_with_continuation("justtext"));
}
#[test]
fn test_continuation_empty() {
assert!(!ends_with_continuation(""));
}
// --- strip_balanced_edge_parens ---
#[test]
fn test_strip_balanced_simple() {
assert_eq!(strip_balanced_edge_parens("(Hello World)"), "Hello World");
}
#[test]
fn test_strip_balanced_unmatched_start() {
assert_eq!(strip_balanced_edge_parens("(Hello World"), "(Hello World");
}
#[test]
fn test_strip_balanced_unmatched_end() {
assert_eq!(strip_balanced_edge_parens("Hello World)"), "Hello World)");
}
#[test]
fn test_strip_balanced_inner_parens() {
assert_eq!(
strip_balanced_edge_parens("(Hello (World)"),
"(Hello (World)"
);
}
#[test]
fn test_strip_balanced_nested_parens() {
assert_eq!(
strip_balanced_edge_parens("(Hello (World))"),
"(Hello (World))"
);
}
#[test]
fn test_strip_balanced_no_parens() {
assert_eq!(strip_balanced_edge_parens("Hello World"), "Hello World");
}
// --- collect_candidate_lines: grouping behaviour ---
#[test]
fn test_collect_empty_input() {
let lines: Vec<(usize, String)> = vec![];
let groups = collect_candidate_lines(lines);
assert!(groups.is_empty());
}
#[test]
fn test_collect_single_copyright_line() {
let lines = vec![(1, "Copyright 2024 Acme Inc.".to_string())];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 1);
assert_eq!(groups[0].len(), 1);
assert_eq!(groups[0][0].0, 1);
}
#[test]
fn test_collect_non_candidate_lines() {
let lines = vec![
(1, "This is just code".to_string()),
(2, "More code here".to_string()),
];
let groups = collect_candidate_lines(lines);
assert!(groups.is_empty());
}
#[test]
fn test_collect_two_separate_copyrights() {
let lines = vec![
(1, "Copyright 2020 Foo".to_string()),
(2, "some random code".to_string()),
(3, "not related".to_string()),
(4, "also not related".to_string()),
(5, "Copyright 2024 Bar".to_string()),
];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 2, "groups: {groups:?}");
}
#[test]
fn test_collect_end_of_statement_yields_immediately() {
let lines = vec![
(1, "Copyright 2024 Acme Inc.".to_string()),
(2, "All rights reserved.".to_string()),
(3, "Some other text".to_string()),
];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 1, "groups: {groups:?}");
assert_eq!(groups[0].len(), 2);
}
#[test]
fn test_collect_continuation_with_trailing_year() {
let lines = vec![
(1, "Copyright 2024".to_string()),
(2, "".to_string()),
(3, "Acme Inc.".to_string()),
];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 1, "groups: {groups:?}");
}
#[test]
fn test_collect_break_on_empty_without_continuation() {
let lines = vec![
(1, "Copyright 2024 Acme Inc.".to_string()),
(2, "Some additional text".to_string()),
(3, "".to_string()),
(4, "Copyright 2025 Bar".to_string()),
];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 2, "groups: {groups:?}");
}
#[test]
fn test_collect_http_as_candidate() {
let lines = vec![(1, "http://example.com/copyright".to_string())];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 1, "groups: {groups:?}");
}
#[test]
fn test_collect_debian_markup() {
let lines = vec![(1, "<s>John Doe</s>".to_string())];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 1, "groups: {groups:?}");
}
#[test]
fn test_collect_multiline_copyright() {
let lines = vec![
(1, "Copyright (C) 2020-2024".to_string()),
(2, " Acme Corporation".to_string()),
(3, " All rights reserved.".to_string()),
];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 1, "groups: {groups:?}");
assert_eq!(groups[0].len(), 3);
}
// --- `@directive` handling ---
#[test]
fn test_noncopyright_at_directive_line_detection() {
assert!(is_noncopyright_at_directive_line(
"@lint-ignore-every FBOBJCIMPORTORDER1 METHOD_BRACKETSMETHOD_BRACKETS"
));
assert!(!is_noncopyright_at_directive_line("@author Jane Doe"));
assert!(!is_noncopyright_at_directive_line(
"@copyright 2024 Example Corp."
));
assert!(!is_noncopyright_at_directive_line("@History:"));
}
#[test]
fn test_collect_breaks_before_noncopyright_at_directive_line() {
let lines = vec![
(
1,
"// (c) Example Corp. and affiliates. Confidential and proprietary.".to_string(),
),
(
2,
"// @lint-ignore-every FBOBJCIMPORTORDER1 METHOD_BRACKETSMETHOD_BRACKETS"
.to_string(),
),
];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 1, "groups: {groups:?}");
assert_eq!(groups[0].len(), 1, "groups: {groups:?}");
assert_eq!(
groups[0][0].1,
"(c) Example Corp. and affiliates. Confidential and proprietary."
);
}
#[test]
fn test_collect_remaining_candidates_at_end() {
let lines = vec![
(1, "Some preamble".to_string()),
(2, "Copyright 2024 Acme".to_string()),
];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 1);
assert_eq!(groups[0][0].0, 2);
}
// --- has_copyright_indicators ---
#[test]
fn test_indicators_copyright_word() {
assert!(has_copyright_indicators("blah Copyright 2024 blah"));
}
#[test]
fn test_indicators_copyleft() {
assert!(has_copyright_indicators("Copyleft notice here"));
}
#[test]
fn test_indicators_author() {
assert!(has_copyright_indicators("@author John Doe"));
}
#[test]
fn test_indicators_c_sign_with_year() {
assert!(has_copyright_indicators("(c)2024 Acme Inc."));
assert!(has_copyright_indicators("(C) 1996 Id Software"));
assert!(has_copyright_indicators("(c) 2020 Foo"));
}
#[test]
fn test_indicators_c_sign_code_pattern() {
assert!(!has_copyright_indicators("if(c){var r=[]}"));
assert!(!has_copyright_indicators("function(c){return c.length}"));
}
#[test]
fn test_indicators_no_match() {
assert!(!has_copyright_indicators("var x = 42; function foo() {}"));
assert!(!has_copyright_indicators(
"just some random @ text with right margin"
));
}
#[test]
fn test_collect_skips_swift_convention_c_signature_lines() {
let lines = vec![
(
1,
"let invokeSuperSetter: @convention(c) (NSObject, AnyClass, Selector, AnyObject?) -> Void = { object, superclass, selector, delegate in".to_string(),
),
(
2,
"typealias Setter = @convention(c) (NSObject, Selector, AnyObject?) -> Void"
.to_string(),
),
];
assert!(collect_candidate_lines(lines).is_empty());
}
// --- has_c_sign_before_year ---
#[test]
fn test_c_sign_year_adjacent() {
assert!(has_c_sign_before_year(b"(c)2024"));
}
#[test]
fn test_c_sign_year_with_space() {
assert!(has_c_sign_before_year(b"(c) 1996"));
}
#[test]
fn test_c_sign_year_uppercase() {
assert!(has_c_sign_before_year(b"(C)2024"));
}
#[test]
fn test_c_sign_code_brace() {
assert!(!has_c_sign_before_year(b"(c){var}"));
}
#[test]
fn test_c_sign_code_dot() {
assert!(!has_c_sign_before_year(b"(c).length"));
}
#[test]
fn test_c_sign_empty_after() {
assert!(!has_c_sign_before_year(b"(c)"));
}
// --- over-long lines ---
#[test]
fn test_collect_skips_long_line_without_indicators() {
let long_line = "x".repeat(3000);
let lines = vec![
(1, "Copyright 2024 Acme".to_string()),
(2, long_line),
(3, "Copyright 2025 Bar".to_string()),
];
let groups = collect_candidate_lines(lines);
assert!(
!groups.is_empty(),
"Should still detect copyrights: {groups:?}"
);
}
#[test]
fn test_collect_keeps_long_line_with_copyright() {
let mut long_line = "x".repeat(2500);
long_line.push_str(" Copyright 2024 Acme Inc. ");
long_line.push_str(&"y".repeat(500));
let lines = vec![(1, long_line)];
let groups = collect_candidate_lines(lines);
assert_eq!(
groups.len(),
1,
"Should detect copyright in long line: {groups:?}"
);
}
#[test]
fn test_collect_keeps_long_line_with_c_sign_year() {
let mut long_line = "x".repeat(2500);
long_line.push_str(" (c)1996 Id Software ");
long_line.push_str(&"y".repeat(500));
let lines = vec![(1, long_line)];
let groups = collect_candidate_lines(lines);
assert_eq!(
groups.len(),
1,
"Should detect (c)year in long line: {groups:?}"
);
}
// --- contains_ascii_ci ---
#[test]
fn test_contains_ascii_ci_found() {
assert!(contains_ascii_ci(b"Hello World", b"world"));
assert!(contains_ascii_ci(b"CoPyRiGhT", b"opyr"));
}
#[test]
fn test_contains_ascii_ci_not_found() {
assert!(!contains_ascii_ci(b"Hello World", b"xyz"));
}
#[test]
fn test_contains_ascii_ci_needle_longer() {
assert!(!contains_ascii_ci(b"Hi", b"Hello"));
}
// --- is_uuencode_data_line ---
#[test]
fn test_uuencode_full_data_line() {
let line = b"M?T5,1@$\"`0`````````````!``@````!`````````````]'0```0`0`T````";
assert!(is_uuencode_data_line(line));
}
#[test]
fn test_uuencode_short_data_line() {
let line = b"1`@``*%P```(\"```H8````@(`";
assert!(is_uuencode_data_line(line));
assert!(!is_encoded_data_line(std::str::from_utf8(line).unwrap()));
}
#[test]
fn test_uuencode_not_natural_text() {
let line = b"This is a normal English sentence with spaces between words here";
assert!(!is_uuencode_data_line(line));
}
#[test]
fn test_uuencode_not_copyright_line() {
let line = b" * Copyright (c) 2002-2006 Sam Leffler, Errno Consulting, Atheros";
assert!(!is_uuencode_data_line(line));
}
#[test]
fn test_uuencode_not_comment_decorator() {
let line = b"/************************************************************************/";
assert!(!is_uuencode_data_line(line));
}
#[test]
fn test_uuencode_not_star_line() {
let line = b"************************************************************";
assert!(!is_uuencode_data_line(line));
}
#[test]
fn test_uuencode_not_dash_line() {
let line = b"------------------------------------------------------------";
assert!(!is_uuencode_data_line(line));
}
// --- is_base64_data_line ---
#[test]
fn test_base64_typical_line() {
let line = b"SGVsbG8gV29ybGQhIFRoaXMgaXMgYSBiYXNlNjQgZW5jb2RlZCBzdHJpbmc=";
assert!(is_base64_data_line(line));
}
#[test]
fn test_base64_with_plus_slash() {
let line = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
assert!(is_base64_data_line(line));
}
#[test]
fn test_base64_not_text_with_spaces() {
let line = b"This is not base64 because it has spaces in it right here";
assert!(!is_base64_data_line(line));
}
#[test]
fn test_base64_not_url() {
let line = b"http://code.google.com/apis/protocolbuffers/";
assert!(!is_base64_data_line(line));
}
#[test]
fn test_base64_not_file_path() {
let line = b"/usr/local/lib/python3.11/site-packages/some_package/module.py";
assert!(!is_base64_data_line(line));
}
// --- is_encoded_data_line ---
#[test]
fn test_encoded_skips_short_lines() {
assert!(!is_encoded_data_line("short"));
assert!(!is_encoded_data_line("M`@``")); }
#[test]
fn test_encoded_preserves_copyright_indicators() {
let line = "M".to_string() + &"`".repeat(20) + "Copyright" + &"`".repeat(30);
assert!(!is_encoded_data_line(&line));
}
#[test]
fn test_encoded_detects_uuencode() {
let line = "M?T5,1@$\"`0`````````````!``@````!`````````````]'0```0`0`T````";
assert!(is_encoded_data_line(line));
}
#[test]
fn test_encoded_detects_base64() {
let line = "SGVsbG8gV29ybGQhIFRoaXMgaXMgYSBiYXNlNjQgZW5jb2RlZCBzdHJpbmc=";
assert!(is_encoded_data_line(line));
}
#[test]
fn test_encoded_preserves_normal_text() {
assert!(!is_encoded_data_line(
"This is a normal line of source code with various characters"
));
}
#[test]
fn test_encoded_preserves_email_line() {
assert!(!is_encoded_data_line(
"Contact us at support@example.com for more information about this"
));
}
// --- collect_candidate_lines with encoded payload ---
#[test]
fn test_collect_skips_uuencode_data_lines() {
let uu_line = "M?T5,1@$\"`0`````````````!``@````!`````````````]'0```0`0`T````";
let lines = vec![
(1, "Copyright 2024 Acme".to_string()),
(2, uu_line.to_string()),
(3, uu_line.to_string()),
(4, uu_line.to_string()),
(5, "Copyright 2025 Bar".to_string()),
];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 2, "Should detect both copyrights: {groups:?}");
}
#[test]
fn test_collect_preserves_copyright_in_uuencode_file() {
let uu_line = "M?T5,1@$\"`0`````````````!``@````!`````````````]'0```0`0`T````";
let mut lines: Vec<(usize, String)> = vec![
(
1,
" * Copyright (c) 2002-2006 Sam Leffler, Errno Consulting, Atheros".to_string(),
),
(
2,
" * Communications, Inc. All rights reserved.".to_string(),
),
];
for i in 3..103 {
lines.push((i, uu_line.to_string()));
}
let groups = collect_candidate_lines(lines);
assert!(!groups.is_empty(), "Should detect copyright header");
assert_eq!(groups[0].len(), 2, "Copyright group should have 2 lines");
}
// --- versioned project banners ---
#[test]
fn test_collect_keeps_versioned_project_banner_with_mixed_case_brand_holder() {
let lines = vec![(
1,
"/*! jQuery v2.2.0 | (c) jQuery Foundation | jquery.org/license */".to_string(),
)];
let groups = collect_candidate_lines(lines);
assert_eq!(groups.len(), 1, "groups: {groups:?}");
assert_eq!(groups[0].len(), 1, "groups: {groups:?}");
}
#[test]
fn test_versioned_banner_holder_from_prepared_extracts_holder() {
let prepared =
"! jQuery v3.7.1 (c) OpenJS Foundation and other contributors jquery.org/license";
assert_eq!(
versioned_banner_holder_from_prepared(prepared),
Some("OpenJS Foundation and other contributors".to_string())
);
}
}