Skip to main content

provenant/parsers/
readme.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for third-party attribution README files.
5//!
6//! Extracts package metadata from semi-structured README files used to document
7//! third-party dependencies in Android, Chromium, Facebook, Google, and similar codebases.
8//!
9//! # Supported Formats
10//! - README.android
11//! - README.chromium
12//! - README.facebook
13//! - README.google
14//! - README.thirdparty
15//!
16//! # Key Features
17//! - Key:value pair extraction (both `:` and `=` separators)
18//! - Parent directory name fallback for packages without explicit names
19//! - Field name mapping to standardized PackageData fields
20//!
21//! # Implementation Notes
22//! - Keys are matched case-insensitively
23//! - Lines without valid separators are skipped
24//! - Multiple URL-related keys map to homepage_url (repo, source, upstream, etc.)
25//! - Separator precedence: the first separator (`:` or `=`) on each line is used
26
27use crate::models::PackageData;
28use crate::models::{DatasourceId, PackageType};
29use crate::parser_warn as warn;
30use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
31use std::path::Path;
32
33use super::PackageParser;
34
/// Parser for third-party attribution README files.
///
/// This is a stateless marker type: it has no fields and only serves as the
/// implementor of [`PackageParser`] for the `README.*` attribution formats.
///
/// The common traits are derived so the public type can be printed in
/// diagnostics and copied or defaulted freely.
#[derive(Debug, Clone, Copy, Default)]
pub struct ReadmeParser;
40
41impl PackageParser for ReadmeParser {
42    const PACKAGE_TYPE: PackageType = PackageType::Readme;
43
44    fn is_match(path: &Path) -> bool {
45        path.file_name().is_some_and(|name| {
46            let name = name.to_string_lossy().to_lowercase();
47            matches!(
48                name.as_str(),
49                "readme.android"
50                    | "readme.chromium"
51                    | "readme.facebook"
52                    | "readme.google"
53                    | "readme.thirdparty"
54            )
55        })
56    }
57
58    fn extract_packages(path: &Path) -> Vec<PackageData> {
59        let content = match read_file_to_string(path, None) {
60            Ok(content) => content,
61            Err(e) => {
62                warn!("Failed to read README file at {:?}: {}", path, e);
63                return vec![default_package_data()];
64            }
65        };
66
67        let mut pkg = default_package_data();
68
69        // Parse key:value pairs
70        for line in content.lines().take(MAX_ITERATION_COUNT) {
71            let line = line.trim();
72            if line.is_empty() {
73                continue;
74            }
75
76            let split_colon = line.split_once(':');
77            let split_equals = line.split_once('=');
78
79            let (key, value) = match (split_colon, split_equals) {
80                (Some((ck, cv)), Some((ek, _))) if ck.len() <= ek.len() => (ck.trim(), cv.trim()),
81                (_, Some((ek, ev))) => (ek.trim(), ev.trim()),
82                (Some((ck, cv)), None) => (ck.trim(), cv.trim()),
83                (None, None) => continue,
84            };
85
86            if key.is_empty() || value.is_empty() {
87                continue;
88            }
89
90            // Map README field to PackageData field (case-insensitive)
91            let key_lower = key.to_lowercase();
92            match key_lower.as_str() {
93                "name" | "project" => {
94                    pkg.name = Some(truncate_field(value.to_string()));
95                }
96                "version" => {
97                    pkg.version = Some(truncate_field(value.to_string()));
98                }
99                "copyright" => {
100                    pkg.copyright = Some(truncate_field(value.to_string()));
101                }
102                "download link" | "downloaded from" => {
103                    pkg.download_url = Some(truncate_field(value.to_string()));
104                }
105                "homepage" | "website" | "repo" | "source" | "upstream" | "url" | "project url" => {
106                    pkg.homepage_url = Some(truncate_field(value.to_string()));
107                }
108                "licence" | "license" => {
109                    pkg.extracted_license_statement = Some(truncate_field(value.to_string()));
110                }
111                _ => {
112                    // Unrecognized field, skip
113                }
114            }
115        }
116
117        // Fallback: use parent directory name if no name was found
118        if pkg.name.is_none()
119            && let Some(parent) = path.parent()
120            && let Some(parent_name) = parent.file_name()
121        {
122            pkg.name = Some(truncate_field(parent_name.to_string_lossy().to_string()));
123        }
124
125        vec![pkg]
126    }
127}
128
129fn default_package_data() -> PackageData {
130    PackageData {
131        package_type: Some(ReadmeParser::PACKAGE_TYPE),
132        datasource_id: Some(DatasourceId::Readme),
133        ..Default::default()
134    }
135}
136
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Runs `is_match` on `file_name` placed under an arbitrary directory.
    fn matches(file_name: &str) -> bool {
        ReadmeParser::is_match(&PathBuf::from(format!("/some/path/{file_name}")))
    }

    /// Builds the path of a fixture located below `testdata/readme/`.
    fn fixture(relative: &str) -> PathBuf {
        PathBuf::from(format!("testdata/readme/{relative}"))
    }

    #[test]
    fn test_is_match_android() {
        assert!(matches("README.android"));
    }

    #[test]
    fn test_is_match_chromium() {
        assert!(matches("README.chromium"));
    }

    #[test]
    fn test_is_match_facebook() {
        assert!(matches("README.facebook"));
    }

    #[test]
    fn test_is_match_google() {
        assert!(matches("README.google"));
    }

    #[test]
    fn test_is_match_thirdparty() {
        assert!(matches("README.thirdparty"));
    }

    #[test]
    fn test_is_match_case_insensitive() {
        assert!(matches("README.CHROMIUM"));
        assert!(matches("README.ChRoMiUm"));
    }

    #[test]
    fn test_is_match_negative_cases() {
        assert!(!matches("README.md"));
        assert!(!matches("README.txt"));
        assert!(!matches("README"));
        assert!(!matches("INSTALL.txt"));
    }

    #[test]
    fn test_extract_chromium_format() {
        let pkg =
            ReadmeParser::extract_first_package(&fixture("chromium/third_party/example/README.chromium"));

        assert_eq!(pkg.package_type, Some(PackageType::Readme));
        assert_eq!(pkg.name.as_deref(), Some("Example Library"));
        assert_eq!(pkg.version.as_deref(), Some("2.1.0"));
        assert_eq!(pkg.homepage_url.as_deref(), Some("https://example.com"));
        assert_eq!(pkg.extracted_license_statement.as_deref(), Some("MIT"));
        assert_eq!(pkg.datasource_id, Some(DatasourceId::Readme));
    }

    #[test]
    fn test_extract_android_format() {
        let pkg =
            ReadmeParser::extract_first_package(&fixture("android/third_party/example/README.android"));

        assert_eq!(pkg.name.as_deref(), Some("Android Example"));
        assert_eq!(pkg.version.as_deref(), Some("1.0"));
        assert_eq!(pkg.homepage_url.as_deref(), Some("https://android.example.com"));
        assert_eq!(pkg.copyright.as_deref(), Some("2024 Google Inc."));
    }

    #[test]
    fn test_extract_facebook_format() {
        let pkg =
            ReadmeParser::extract_first_package(&fixture("facebook/third_party/example/README.facebook"));

        assert_eq!(pkg.name.as_deref(), Some("FB Library"));
        assert_eq!(
            pkg.download_url.as_deref(),
            Some("https://github.com/example/fb-lib")
        );
        assert_eq!(pkg.extracted_license_statement.as_deref(), Some("BSD-3-Clause"));
    }

    #[test]
    fn test_extract_parent_dir_fallback() {
        let pkg =
            ReadmeParser::extract_first_package(&fixture("no-name/third_party/mylib/README.thirdparty"));

        // The fixture has no name field, so the parent directory "mylib" is used.
        assert_eq!(pkg.name.as_deref(), Some("mylib"));
        assert_eq!(pkg.homepage_url.as_deref(), Some("https://example.com"));
        assert_eq!(pkg.version.as_deref(), Some("3.0"));
    }

    #[test]
    fn test_extract_equals_separator() {
        let pkg = ReadmeParser::extract_first_package(&fixture(
            "equals-separator/third_party/eqlib/README.google",
        ));

        assert_eq!(pkg.name.as_deref(), Some("Google Lib"));
        assert_eq!(pkg.homepage_url.as_deref(), Some("https://google.example.com"));
        assert_eq!(pkg.extracted_license_statement.as_deref(), Some("Apache-2.0"));
    }

    #[test]
    fn test_case_insensitive_field_names() {
        let pkg =
            ReadmeParser::extract_first_package(&fixture("chromium/third_party/example/README.chromium"));

        // The fixture spells its keys "Name:", "URL:", "Version:", "License:";
        // each must be recognized regardless of capitalization.
        assert!(pkg.name.is_some());
        assert!(pkg.version.is_some());
        assert!(pkg.homepage_url.is_some());
        assert!(pkg.extracted_license_statement.is_some());
    }

    #[test]
    fn test_invalid_file() {
        let pkg = ReadmeParser::extract_first_package(&fixture("nonexistent/README.chromium"));

        // An unreadable file still yields a record tagged with type and datasource.
        assert_eq!(pkg.package_type, Some(PackageType::Readme));
        assert_eq!(pkg.datasource_id, Some(DatasourceId::Readme));
    }
}
287
// Registration entry for this parser. The glob list mirrors the file names
// accepted by `ReadmeParser::is_match`, so the two must be updated together.
// NOTE(review): the meaning of each positional argument (description, path
// patterns, "readme", the empty string, and the optional documentation URL)
// is defined by `register_parser!` elsewhere in the crate — confirm there.
crate::register_parser!(
    "Third-party attribution README files",
    &[
        "**/README.android",
        "**/README.chromium",
        "**/README.facebook",
        "**/README.google",
        "**/README.thirdparty"
    ],
    "readme",
    "",
    Some(
        "https://github.com/chromium/chromium/blob/main/docs/contributing.md#third_party-components"
    ),
);