Skip to main content

provenant/parsers/
readme.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for third-party attribution README files.
5//!
6//! Extracts package metadata from semi-structured README files used to document
7//! third-party dependencies in Android, Chromium, Facebook, Google, and similar codebases.
8//!
9//! # Supported Formats
10//! - README.android
11//! - README.chromium
12//! - README.facebook
13//! - README.google
14//! - README.thirdparty
15//!
16//! # Key Features
17//! - Key:value pair extraction (both `:` and `=` separators)
18//! - Parent directory name fallback for packages without explicit names
19//! - Field name mapping to standardized PackageData fields
20//!
21//! # Implementation Notes
22//! - Keys are matched case-insensitively
23//! - Lines without valid separators are skipped
24//! - Multiple URL-related keys map to homepage_url (repo, source, upstream, etc.)
25//! - Separator precedence: the first separator (`:` or `=`) on each line is used
26
27use crate::models::PackageData;
28use crate::models::{DatasourceId, PackageType};
29use crate::parser_warn as warn;
30use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
31use std::path::Path;
32
33use super::PackageParser;
34use super::metadata::ParserMetadata;
35
/// README attribution file parser.
///
/// Extracts package metadata from semi-structured README files commonly used
/// to document third-party dependencies in large codebases.
///
/// This is a field-less unit struct: it carries no state, and all of its
/// behavior is provided through its [`PackageParser`] implementation
/// (file-name matching, parser metadata, and package extraction).
pub struct ReadmeParser;
41
42impl PackageParser for ReadmeParser {
43    const PACKAGE_TYPE: PackageType = PackageType::Readme;
44
45    fn metadata() -> Vec<ParserMetadata> {
46        vec![ParserMetadata {
47            description: "Third-party attribution README files",
48            file_patterns: &[
49                "**/README.android",
50                "**/README.chromium",
51                "**/README.facebook",
52                "**/README.google",
53                "**/README.thirdparty",
54            ],
55            package_type: "readme",
56            primary_language: "",
57            documentation_url: Some(
58                "https://github.com/chromium/chromium/blob/main/docs/contributing.md#third_party-components",
59            ),
60        }]
61    }
62
63    fn is_match(path: &Path) -> bool {
64        path.file_name().is_some_and(|name| {
65            let name = name.to_string_lossy().to_lowercase();
66            matches!(
67                name.as_str(),
68                "readme.android"
69                    | "readme.chromium"
70                    | "readme.facebook"
71                    | "readme.google"
72                    | "readme.thirdparty"
73            )
74        })
75    }
76
77    fn extract_packages(path: &Path) -> Vec<PackageData> {
78        let content = match read_file_to_string(path, None) {
79            Ok(content) => content,
80            Err(e) => {
81                warn!("Failed to read README file at {:?}: {}", path, e);
82                return vec![default_package_data()];
83            }
84        };
85
86        let mut pkg = default_package_data();
87
88        // Parse key:value pairs
89        for line in content.lines().take(MAX_ITERATION_COUNT) {
90            let line = line.trim();
91            if line.is_empty() {
92                continue;
93            }
94
95            let split_colon = line.split_once(':');
96            let split_equals = line.split_once('=');
97
98            let (key, value) = match (split_colon, split_equals) {
99                (Some((ck, cv)), Some((ek, _))) if ck.len() <= ek.len() => (ck.trim(), cv.trim()),
100                (_, Some((ek, ev))) => (ek.trim(), ev.trim()),
101                (Some((ck, cv)), None) => (ck.trim(), cv.trim()),
102                (None, None) => continue,
103            };
104
105            if key.is_empty() || value.is_empty() {
106                continue;
107            }
108
109            // Map README field to PackageData field (case-insensitive)
110            let key_lower = key.to_lowercase();
111            match key_lower.as_str() {
112                "name" | "project" => {
113                    pkg.name = Some(truncate_field(value.to_string()));
114                }
115                "version" => {
116                    pkg.version = Some(truncate_field(value.to_string()));
117                }
118                "copyright" => {
119                    pkg.copyright = Some(truncate_field(value.to_string()));
120                }
121                "download link" | "downloaded from" => {
122                    pkg.download_url = Some(truncate_field(value.to_string()));
123                }
124                "homepage" | "website" | "repo" | "source" | "upstream" | "url" | "project url" => {
125                    pkg.homepage_url = Some(truncate_field(value.to_string()));
126                }
127                "licence" | "license" => {
128                    pkg.extracted_license_statement = Some(truncate_field(value.to_string()));
129                }
130                _ => {
131                    // Unrecognized field, skip
132                }
133            }
134        }
135
136        // Fallback: use parent directory name if no name was found
137        if pkg.name.is_none()
138            && let Some(parent) = path.parent()
139            && let Some(parent_name) = parent.file_name()
140        {
141            pkg.name = Some(truncate_field(parent_name.to_string_lossy().to_string()));
142        }
143
144        vec![pkg]
145    }
146}
147
148fn default_package_data() -> PackageData {
149    PackageData {
150        package_type: Some(ReadmeParser::PACKAGE_TYPE),
151        datasource_id: Some(DatasourceId::Readme),
152        ..Default::default()
153    }
154}
155
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Runs the parser's file-name predicate on a path given as a string.
    fn matches(raw: &str) -> bool {
        ReadmeParser::is_match(&PathBuf::from(raw))
    }

    // --- File-name matching -------------------------------------------------

    #[test]
    fn test_is_match_android() {
        assert!(matches("/some/path/README.android"));
    }

    #[test]
    fn test_is_match_chromium() {
        assert!(matches("/some/path/README.chromium"));
    }

    #[test]
    fn test_is_match_facebook() {
        assert!(matches("/some/path/README.facebook"));
    }

    #[test]
    fn test_is_match_google() {
        assert!(matches("/some/path/README.google"));
    }

    #[test]
    fn test_is_match_thirdparty() {
        assert!(matches("/some/path/README.thirdparty"));
    }

    #[test]
    fn test_is_match_case_insensitive() {
        // Fully upper-case and mixed-case suffixes must both be accepted.
        assert!(matches("/some/path/README.CHROMIUM"));
        assert!(matches("/some/path/README.ChRoMiUm"));
    }

    #[test]
    fn test_is_match_negative_cases() {
        // Ordinary READMEs and unrelated files are not attribution files.
        assert!(!matches("/some/path/README.md"));
        assert!(!matches("/some/path/README.txt"));
        assert!(!matches("/some/path/README"));
        assert!(!matches("/some/path/INSTALL.txt"));
    }

    // --- Extraction from fixture files ---------------------------------------

    #[test]
    fn test_extract_chromium_format() {
        let pkg = ReadmeParser::extract_first_package(&PathBuf::from(
            "testdata/readme/chromium/third_party/example/README.chromium",
        ));

        assert_eq!(pkg.package_type, Some(PackageType::Readme));
        assert_eq!(pkg.datasource_id, Some(DatasourceId::Readme));
        assert_eq!(pkg.name.as_deref(), Some("Example Library"));
        assert_eq!(pkg.version.as_deref(), Some("2.1.0"));
        assert_eq!(pkg.homepage_url.as_deref(), Some("https://example.com"));
        assert_eq!(pkg.extracted_license_statement.as_deref(), Some("MIT"));
    }

    #[test]
    fn test_extract_android_format() {
        let pkg = ReadmeParser::extract_first_package(&PathBuf::from(
            "testdata/readme/android/third_party/example/README.android",
        ));

        assert_eq!(pkg.name.as_deref(), Some("Android Example"));
        assert_eq!(pkg.version.as_deref(), Some("1.0"));
        assert_eq!(
            pkg.homepage_url.as_deref(),
            Some("https://android.example.com")
        );
        assert_eq!(pkg.copyright.as_deref(), Some("2024 Google Inc."));
    }

    #[test]
    fn test_extract_facebook_format() {
        let pkg = ReadmeParser::extract_first_package(&PathBuf::from(
            "testdata/readme/facebook/third_party/example/README.facebook",
        ));

        assert_eq!(pkg.name.as_deref(), Some("FB Library"));
        assert_eq!(
            pkg.download_url.as_deref(),
            Some("https://github.com/example/fb-lib")
        );
        assert_eq!(
            pkg.extracted_license_statement.as_deref(),
            Some("BSD-3-Clause")
        );
    }

    #[test]
    fn test_extract_parent_dir_fallback() {
        let pkg = ReadmeParser::extract_first_package(&PathBuf::from(
            "testdata/readme/no-name/third_party/mylib/README.thirdparty",
        ));

        // The fixture has no name field, so the parent directory "mylib" is used.
        assert_eq!(pkg.name.as_deref(), Some("mylib"));
        assert_eq!(pkg.homepage_url.as_deref(), Some("https://example.com"));
        assert_eq!(pkg.version.as_deref(), Some("3.0"));
    }

    #[test]
    fn test_extract_equals_separator() {
        let pkg = ReadmeParser::extract_first_package(&PathBuf::from(
            "testdata/readme/equals-separator/third_party/eqlib/README.google",
        ));

        assert_eq!(pkg.name.as_deref(), Some("Google Lib"));
        assert_eq!(
            pkg.homepage_url.as_deref(),
            Some("https://google.example.com")
        );
        assert_eq!(
            pkg.extracted_license_statement.as_deref(),
            Some("Apache-2.0")
        );
    }

    #[test]
    fn test_case_insensitive_field_names() {
        let pkg = ReadmeParser::extract_first_package(&PathBuf::from(
            "testdata/readme/chromium/third_party/example/README.chromium",
        ));

        // The fixture spells its keys "Name:", "URL:", "Version:", "License:";
        // each must be recognized despite the capitalization.
        assert!(pkg.name.is_some());
        assert!(pkg.version.is_some());
        assert!(pkg.homepage_url.is_some());
        assert!(pkg.extracted_license_statement.is_some());
    }

    #[test]
    fn test_invalid_file() {
        let pkg = ReadmeParser::extract_first_package(&PathBuf::from(
            "testdata/readme/nonexistent/README.chromium",
        ));

        // An unreadable file still yields a record with type and datasource set.
        assert_eq!(pkg.package_type, Some(PackageType::Readme));
        assert_eq!(pkg.datasource_id, Some(DatasourceId::Readme));
    }
}