// provenant/parsers/readme.rs

//! Parser for third-party attribution README files.
//!
//! Extracts package metadata from semi-structured README files used to document
//! third-party dependencies in Android, Chromium, Facebook, Google, and similar codebases.
//!
//! # Supported Formats
//! - README.android
//! - README.chromium
//! - README.facebook
//! - README.google
//! - README.thirdparty
//!
//! # Key Features
//! - Key:value pair extraction (both `:` and `=` separators)
//! - Parent directory name fallback for packages without explicit names
//! - Field name mapping to standardized PackageData fields
//!
//! # Implementation Notes
//! - Keys are matched case-insensitively
//! - Lines without valid separators are skipped
//! - Multiple URL-related keys map to homepage_url (repo, source, upstream, etc.)
//! - Separator precedence: the first separator (`:` or `=`) on each line is used
23
24use crate::models::PackageData;
25use crate::models::{DatasourceId, PackageType};
26use crate::parsers::utils::read_file_to_string;
27use log::warn;
28use std::path::Path;
29
30use super::PackageParser;
31
/// README attribution file parser.
///
/// A stateless marker type: all behavior lives in its `PackageParser`
/// implementation, which extracts package metadata from semi-structured
/// README files commonly used to document third-party dependencies in
/// large codebases.
///
/// The common traits are derived so the marker can be logged, copied, and
/// default-constructed like any other zero-sized value.
#[derive(Debug, Clone, Copy, Default)]
pub struct ReadmeParser;
37
38impl PackageParser for ReadmeParser {
39    const PACKAGE_TYPE: PackageType = PackageType::Readme;
40
41    fn is_match(path: &Path) -> bool {
42        path.file_name().is_some_and(|name| {
43            let name = name.to_string_lossy().to_lowercase();
44            matches!(
45                name.as_str(),
46                "readme.android"
47                    | "readme.chromium"
48                    | "readme.facebook"
49                    | "readme.google"
50                    | "readme.thirdparty"
51            )
52        })
53    }
54
55    fn extract_packages(path: &Path) -> Vec<PackageData> {
56        let content = match read_file_to_string(path) {
57            Ok(content) => content,
58            Err(e) => {
59                warn!("Failed to read README file at {:?}: {}", path, e);
60                return vec![default_package_data()];
61            }
62        };
63
64        let mut pkg = default_package_data();
65
66        // Parse key:value pairs
67        for line in content.lines() {
68            let line = line.trim();
69            if line.is_empty() {
70                continue;
71            }
72
73            let split_colon = line.split_once(':');
74            let split_equals = line.split_once('=');
75
76            let (key, value) = match (split_colon, split_equals) {
77                (Some((ck, cv)), Some((ek, _))) if ck.len() <= ek.len() => (ck.trim(), cv.trim()),
78                (_, Some((ek, ev))) => (ek.trim(), ev.trim()),
79                (Some((ck, cv)), None) => (ck.trim(), cv.trim()),
80                (None, None) => continue,
81            };
82
83            if key.is_empty() || value.is_empty() {
84                continue;
85            }
86
87            // Map README field to PackageData field (case-insensitive)
88            let key_lower = key.to_lowercase();
89            match key_lower.as_str() {
90                "name" | "project" => {
91                    pkg.name = Some(value.to_string());
92                }
93                "version" => {
94                    pkg.version = Some(value.to_string());
95                }
96                "copyright" => {
97                    pkg.copyright = Some(value.to_string());
98                }
99                "download link" | "downloaded from" => {
100                    pkg.download_url = Some(value.to_string());
101                }
102                "homepage" | "website" | "repo" | "source" | "upstream" | "url" | "project url" => {
103                    pkg.homepage_url = Some(value.to_string());
104                }
105                "licence" | "license" => {
106                    pkg.extracted_license_statement = Some(value.to_string());
107                }
108                _ => {
109                    // Unrecognized field, skip
110                }
111            }
112        }
113
114        // Fallback: use parent directory name if no name was found
115        if pkg.name.is_none()
116            && let Some(parent) = path.parent()
117            && let Some(parent_name) = parent.file_name()
118        {
119            pkg.name = Some(parent_name.to_string_lossy().to_string());
120        }
121
122        vec![pkg]
123    }
124}
125
126fn default_package_data() -> PackageData {
127    PackageData {
128        package_type: Some(ReadmeParser::PACKAGE_TYPE),
129        datasource_id: Some(DatasourceId::Readme),
130        ..Default::default()
131    }
132}
133
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Runs `ReadmeParser::is_match` on `file_name` placed under `/some/path`.
    fn matches_name(file_name: &str) -> bool {
        let full_path = PathBuf::from("/some/path").join(file_name);
        ReadmeParser::is_match(&full_path)
    }

    #[test]
    fn test_is_match_android() {
        assert!(matches_name("README.android"));
    }

    #[test]
    fn test_is_match_chromium() {
        assert!(matches_name("README.chromium"));
    }

    #[test]
    fn test_is_match_facebook() {
        assert!(matches_name("README.facebook"));
    }

    #[test]
    fn test_is_match_google() {
        assert!(matches_name("README.google"));
    }

    #[test]
    fn test_is_match_thirdparty() {
        assert!(matches_name("README.thirdparty"));
    }

    #[test]
    fn test_is_match_case_insensitive() {
        assert!(matches_name("README.CHROMIUM"));
        assert!(matches_name("README.ChRoMiUm"));
    }

    #[test]
    fn test_is_match_negative_cases() {
        assert!(!matches_name("README.md"));
        assert!(!matches_name("README.txt"));
        assert!(!matches_name("README"));
        assert!(!matches_name("INSTALL.txt"));
    }

    #[test]
    fn test_extract_chromium_format() {
        let pkg = ReadmeParser::extract_first_package(Path::new(
            "testdata/readme/chromium/third_party/example/README.chromium",
        ));

        assert_eq!(pkg.package_type, Some(PackageType::Readme));
        assert_eq!(pkg.name.as_deref(), Some("Example Library"));
        assert_eq!(pkg.version.as_deref(), Some("2.1.0"));
        assert_eq!(pkg.homepage_url.as_deref(), Some("https://example.com"));
        assert_eq!(pkg.extracted_license_statement.as_deref(), Some("MIT"));
        assert_eq!(pkg.datasource_id, Some(DatasourceId::Readme));
    }

    #[test]
    fn test_extract_android_format() {
        let pkg = ReadmeParser::extract_first_package(Path::new(
            "testdata/readme/android/third_party/example/README.android",
        ));

        assert_eq!(pkg.name.as_deref(), Some("Android Example"));
        assert_eq!(pkg.version.as_deref(), Some("1.0"));
        assert_eq!(
            pkg.homepage_url.as_deref(),
            Some("https://android.example.com")
        );
        assert_eq!(pkg.copyright.as_deref(), Some("2024 Google Inc."));
    }

    #[test]
    fn test_extract_facebook_format() {
        let pkg = ReadmeParser::extract_first_package(Path::new(
            "testdata/readme/facebook/third_party/example/README.facebook",
        ));

        assert_eq!(pkg.name.as_deref(), Some("FB Library"));
        assert_eq!(
            pkg.download_url.as_deref(),
            Some("https://github.com/example/fb-lib")
        );
        assert_eq!(
            pkg.extracted_license_statement.as_deref(),
            Some("BSD-3-Clause")
        );
    }

    #[test]
    fn test_extract_parent_dir_fallback() {
        let pkg = ReadmeParser::extract_first_package(Path::new(
            "testdata/readme/no-name/third_party/mylib/README.thirdparty",
        ));

        // The fixture has no name field, so the parent directory "mylib" is used.
        assert_eq!(pkg.name.as_deref(), Some("mylib"));
        assert_eq!(pkg.homepage_url.as_deref(), Some("https://example.com"));
        assert_eq!(pkg.version.as_deref(), Some("3.0"));
    }

    #[test]
    fn test_extract_equals_separator() {
        let pkg = ReadmeParser::extract_first_package(Path::new(
            "testdata/readme/equals-separator/third_party/eqlib/README.google",
        ));

        assert_eq!(pkg.name.as_deref(), Some("Google Lib"));
        assert_eq!(
            pkg.homepage_url.as_deref(),
            Some("https://google.example.com")
        );
        assert_eq!(
            pkg.extracted_license_statement.as_deref(),
            Some("Apache-2.0")
        );
    }

    #[test]
    fn test_case_insensitive_field_names() {
        let pkg = ReadmeParser::extract_first_package(Path::new(
            "testdata/readme/chromium/third_party/example/README.chromium",
        ));

        // The fixture spells its keys "Name:", "URL:", "Version:", "License:";
        // each must be recognized regardless of capitalization.
        assert!(pkg.name.is_some());
        assert!(pkg.version.is_some());
        assert!(pkg.homepage_url.is_some());
        assert!(pkg.extracted_license_statement.is_some());
    }

    #[test]
    fn test_invalid_file() {
        let pkg = ReadmeParser::extract_first_package(Path::new(
            "testdata/readme/nonexistent/README.chromium",
        ));

        // An unreadable file still yields the default type and datasource.
        assert_eq!(pkg.package_type, Some(PackageType::Readme));
        assert_eq!(pkg.datasource_id, Some(DatasourceId::Readme));
    }
}
284
// Registers this parser's metadata with the crate via `register_parser!`.
// NOTE(review): the macro is defined elsewhere in the crate, so the meaning of
// each positional argument is inferred from its value — presumably a
// human-readable description, the path globs handled by `ReadmeParser`, the
// package type name, an (empty) primary-language string, and an optional
// documentation URL. Confirm against the macro definition before reordering.
crate::register_parser!(
    "Third-party attribution README files",
    &[
        "**/README.android",
        "**/README.chromium",
        "**/README.facebook",
        "**/README.google",
        "**/README.thirdparty"
    ],
    "readme",
    "",
    Some(
        "https://chromium.googlesource.com/chromium/src/+/HEAD/docs/contributing.md#third_party-components"
    ),
);
295    "",
296    Some(
297        "https://chromium.googlesource.com/chromium/src/+/HEAD/docs/contributing.md#third_party-components"
298    ),
299);