Skip to main content

provenant/utils/
font.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::collections::BTreeSet;
5use std::path::Path;
6
7use allsorts::binary::read::ReadScope;
8use allsorts::font_data::FontData;
9use allsorts::tables::{FontTableProvider, NameTable, OpenTypeData};
10use ttf_parser::{Face, Permissions, fonts_in_collection, name_id};
11
/// Lowercase file extensions (without the leading dot) that this module
/// treats as font files. `is_supported_font_extension` compares against
/// these entries ASCII-case-insensitively.
pub(crate) const SUPPORTED_FONT_EXTENSIONS: &[&str] =
    &["ttf", "otf", "woff", "woff2", "eot", "ttc", "otc"];
/// Recursive glob patterns, one per entry of `SUPPORTED_FONT_EXTENSIONS`.
/// Passed to `register_detection_surface!` below.
// NOTE(review): the patterns are lowercase only while the extension check is
// case-insensitive — whether `*.TTF` matches depends on the glob matcher used
// by the consumer of this list; confirm.
pub(crate) const SUPPORTED_FONT_FILE_GLOBS: &[&str] = &[
    "**/*.ttf",
    "**/*.otf",
    "**/*.woff",
    "**/*.woff2",
    "**/*.eot",
    "**/*.ttc",
    "**/*.otc",
];
/// `(from, to)` substring rewrites applied in order by
/// `canonicalize_ofl_license_reference_urls`. Each trailing-slash variant is
/// listed before its slash-less counterpart so the longer form is rewritten
/// first. Every entry maps to `http://scripts.sil.org/OFL`.
const OFL_URL_CANONICALIZATIONS: &[(&str, &str)] = &[
    ("https://scripts.sil.org/OFL/", "http://scripts.sil.org/OFL"),
    ("https://scripts.sil.org/OFL", "http://scripts.sil.org/OFL"),
    ("https://openfontlicense.org/", "http://scripts.sil.org/OFL"),
    ("https://openfontlicense.org", "http://scripts.sil.org/OFL"),
];
/// The four ASCII bytes `name` packed big-endian into a `u32`; handed to the
/// allsorts table provider's `read_table_data` to fetch the `name` table.
const ALLSORTS_NAME_TABLE_TAG: u32 = u32::from_be_bytes(*b"name");
30
// Invokes the crate-level `register_detection_surface!` macro with a
// human-readable description, the supported glob list, two empty strings, and
// a link to the OpenType `name` table specification.
// NOTE(review): the meaning of the two empty-string arguments is defined by
// the macro, which is not visible from this file — confirm before changing.
crate::register_detection_surface!(
    "Embedded font legal metadata (native fonts, webfonts, and collections)",
    SUPPORTED_FONT_FILE_GLOBS,
    "",
    "",
    Some("https://learn.microsoft.com/en-us/typography/opentype/spec/name"),
);
38
39pub(crate) fn is_supported_font_extension(extension: &str) -> bool {
40    SUPPORTED_FONT_EXTENSIONS
41        .iter()
42        .any(|supported| supported.eq_ignore_ascii_case(extension))
43}
44
45pub(crate) fn is_supported_font_path(path: &Path) -> bool {
46    path.extension()
47        .and_then(|ext| ext.to_str())
48        .is_some_and(is_supported_font_extension)
49}
50
51pub(crate) fn extract_font_metadata_text(path: &Path, bytes: &[u8]) -> Option<String> {
52    let extension = path.extension().and_then(|ext| ext.to_str())?;
53    let extension = extension.to_ascii_lowercase();
54    if !is_supported_font_extension(&extension) {
55        return None;
56    }
57
58    match extension.as_str() {
59        "ttf" | "otf" | "woff" | "woff2" | "ttc" | "otc" => extract_sfnt_font_metadata_text(
60            bytes,
61            matches!(extension.as_str(), "ttf" | "otf" | "ttc" | "otc"),
62        ),
63        "eot" => extract_eot_metadata_text(bytes),
64        _ => None,
65    }
66}
67
68fn extract_sfnt_font_metadata_text(bytes: &[u8], include_permissions: bool) -> Option<String> {
69    let mut lines = Vec::new();
70    let mut seen = BTreeSet::new();
71
72    for line in extract_allsorts_name_table_lines(bytes) {
73        if seen.insert(line.clone()) {
74            lines.push(line);
75        }
76    }
77
78    if include_permissions {
79        let face_count = fonts_in_collection(bytes).unwrap_or(1);
80        for face_index in 0..face_count {
81            let Some(permissions) = Face::parse(bytes, face_index).ok()?.permissions() else {
82                continue;
83            };
84            let line = format!(
85                "Embedding permissions: {}",
86                font_permission_label(permissions)
87            );
88            if seen.insert(line.clone()) {
89                lines.push(line);
90            }
91        }
92    }
93
94    (!lines.is_empty()).then(|| lines.join("\n"))
95}
96
97fn extract_allsorts_name_table_lines(bytes: &[u8]) -> Vec<String> {
98    let Some(font_data) = ReadScope::new(bytes).read::<FontData<'_>>().ok() else {
99        return Vec::new();
100    };
101
102    let mut lines = Vec::new();
103    let mut seen = BTreeSet::new();
104    for face_index in 0..allsorts_face_count(&font_data) {
105        let Ok(provider) = font_data.table_provider(face_index) else {
106            continue;
107        };
108        let Ok(name_table_data) = provider.read_table_data(ALLSORTS_NAME_TABLE_TAG) else {
109            continue;
110        };
111        let Ok(name_table) = ReadScope::new(name_table_data.as_ref()).read::<NameTable<'_>>()
112        else {
113            continue;
114        };
115
116        for (source_name_id, target_name_id) in [
117            (NameTable::COPYRIGHT_NOTICE, name_id::COPYRIGHT_NOTICE),
118            (NameTable::LICENSE_DESCRIPTION, name_id::LICENSE),
119            (NameTable::LICENSE_INFO_URL, name_id::LICENSE_URL),
120        ] {
121            let Some(value) = name_table.string_for_id(source_name_id) else {
122                continue;
123            };
124            let Some(line) = build_font_metadata_line(target_name_id, value) else {
125                continue;
126            };
127            if seen.insert(line.clone()) {
128                lines.push(line);
129            }
130        }
131    }
132
133    lines
134}
135
136fn allsorts_face_count(font_data: &FontData<'_>) -> usize {
137    match font_data {
138        FontData::OpenType(font) => match &font.data {
139            OpenTypeData::Single(_) => 1,
140            OpenTypeData::Collection(ttc) => ttc.offset_tables.len(),
141        },
142        FontData::Woff(_) => 1,
143        FontData::Woff2(font) => font
144            .collection_directory
145            .as_ref()
146            .map(|directory| directory.fonts().count())
147            .unwrap_or(1),
148    }
149}
150
151fn extract_eot_metadata_text(bytes: &[u8]) -> Option<String> {
152    let text = extract_eot_utf16le_marker_text(bytes).join("\n");
153    if text.is_empty() {
154        return None;
155    }
156
157    let mut lines = Vec::new();
158    let mut seen = BTreeSet::new();
159    for segment in split_eot_legal_metadata_segments(&text) {
160        let normalized = normalize_eot_metadata_segment(&segment);
161        if normalized.is_empty() {
162            continue;
163        }
164        if seen.insert(normalized.clone()) {
165            lines.push(normalized);
166        }
167    }
168
169    (!lines.is_empty()).then(|| lines.join("\n"))
170}
171
172fn extract_eot_utf16le_marker_text(bytes: &[u8]) -> Vec<String> {
173    let mut lines = Vec::new();
174    let mut seen = BTreeSet::new();
175    for marker in [
176        "Copyright",
177        "This Font Software is licensed under",
178        "http://",
179        "https://",
180    ] {
181        let encoded = marker.encode_utf16().collect::<Vec<_>>();
182        let marker_bytes = encoded
183            .iter()
184            .flat_map(|unit| unit.to_le_bytes())
185            .collect::<Vec<_>>();
186        let mut search_start = 0;
187        while let Some(relative_start) = bytes[search_start..]
188            .windows(marker_bytes.len())
189            .position(|window| window == marker_bytes.as_slice())
190        {
191            let start = search_start + relative_start;
192            let decoded = decode_utf16le_ascii_from_offset(bytes, start);
193            if !decoded.is_empty() && seen.insert(decoded.clone()) {
194                lines.push(decoded);
195            }
196            search_start = start + marker_bytes.len();
197        }
198    }
199    lines
200}
201
/// Decodes the run of printable-ASCII UTF-16LE code units that begins at byte
/// offset `start`.
///
/// Decoding stops at the first unit whose high byte is non-zero or whose low
/// byte is outside `0x20..=0x7E`, or when fewer than two bytes remain. An
/// out-of-range `start` yields an empty string.
fn decode_utf16le_ascii_from_offset(bytes: &[u8], start: usize) -> String {
    bytes
        .get(start..)
        .unwrap_or(&[])
        .chunks_exact(2)
        .map_while(|unit| {
            let (lo, hi) = (unit[0], unit[1]);
            (hi == 0 && (0x20..=0x7E).contains(&lo)).then_some(char::from(lo))
        })
        .collect()
}
217
218fn split_eot_legal_metadata_segments(text: &str) -> Vec<String> {
219    let mut segments = Vec::new();
220
221    if let Some(segment) = extract_text_between_markers(
222        text,
223        "Copyright",
224        &["All Rights Reserved.", "All rights reserved."],
225    ) {
226        segments.push(segment);
227    }
228    if let Some(segment) = extract_text_between_markers(
229        text,
230        "This Font Software is licensed under",
231        &[
232            "governing your use of this Font Software.",
233            "This Font Software.",
234        ],
235    ) {
236        segments.push(segment);
237    }
238    segments.extend(extract_http_segments(text));
239
240    segments
241}
242
/// Returns the slice of `text` that starts at the first `start_marker` and
/// ends right after whichever of `end_markers` finishes earliest.
///
/// End markers are searched from the start marker onwards. When none is
/// found, everything from the start marker to the end of `text` is returned.
/// Returns `None` when `start_marker` does not occur.
fn extract_text_between_markers(
    text: &str,
    start_marker: &str,
    end_markers: &[&str],
) -> Option<String> {
    let tail = &text[text.find(start_marker)?..];

    let mut cut = tail.len();
    for end_marker in end_markers {
        if let Some(found_at) = tail.find(*end_marker) {
            cut = cut.min(found_at + end_marker.len());
        }
    }

    Some(tail[..cut].to_string())
}
257
/// Collects URL-like segments from `text`: all `http://` segments first, then
/// all `https://` segments, each group in order of appearance.
///
/// A segment ends at the first whitespace character or at the next occurrence
/// of a known boundary string, whichever comes first, and has trailing
/// `.`, `,`, `;`, `:` characters removed.
fn extract_http_segments(text: &str) -> Vec<String> {
    const SCHEMES: [&str; 2] = ["http://", "https://"];
    const BOUNDARIES: [&str; 5] = [
        "http://",
        "https://",
        "This Font Software",
        "Copyright",
        "Version ",
    ];

    let mut found = Vec::new();

    for scheme in SCHEMES {
        let mut cursor = 0;
        while let Some(offset) = text[cursor..].find(scheme) {
            let url_start = cursor + offset;
            let body_start = url_start + scheme.len();
            let body = &text[body_start..];

            // Earliest terminator after the scheme, relative to `body`.
            let url_end = BOUNDARIES
                .iter()
                .filter_map(|boundary| body.find(*boundary))
                .chain(body.find(char::is_whitespace))
                .min()
                .map_or(text.len(), |relative_end| body_start + relative_end);

            let url = text[url_start..url_end].trim_end_matches(&['.', ',', ';', ':'][..]);
            if !url.is_empty() {
                found.push(url.to_string());
            }

            cursor = url_end.max(body_start);
        }
    }

    found
}
292
293fn normalize_eot_metadata_segment(segment: &str) -> String {
294    let normalized = segment
295        .split_whitespace()
296        .collect::<Vec<_>>()
297        .join(" ")
298        .trim()
299        .to_string();
300
301    if normalized.is_empty() {
302        return normalized;
303    }
304
305    let lowered = normalized.to_ascii_lowercase();
306    if lowered.starts_with("http://") || lowered.starts_with("https://") {
307        return canonicalize_ofl_license_reference_urls(normalized);
308    }
309
310    if lowered.contains("font software") || lowered.contains("open font license") {
311        return canonicalize_ofl_license_reference_urls(normalized);
312    }
313
314    normalized
315}
316
317fn build_font_metadata_line(name_id_value: u16, value: String) -> Option<String> {
318    let value = normalize_font_value(name_id_value, value);
319    if value.is_empty() {
320        return None;
321    }
322
323    if name_id_value == name_id::COPYRIGHT_NOTICE {
324        return Some(value);
325    }
326
327    let label = font_name_label(name_id_value)?;
328    Some(format!("{label}: {value}"))
329}
330
331fn font_name_label(name_id_value: u16) -> Option<&'static str> {
332    match name_id_value {
333        name_id::LICENSE => Some("License Description"),
334        name_id::LICENSE_URL => Some("License Info URL"),
335        _ => None,
336    }
337}
338
339fn normalize_font_value(name_id_value: u16, value: String) -> String {
340    let normalized = value
341        .split_whitespace()
342        .collect::<Vec<_>>()
343        .join(" ")
344        .trim()
345        .to_string();
346
347    match name_id_value {
348        name_id::COPYRIGHT_NOTICE => strip_reserved_font_name_clause(normalized),
349        name_id::LICENSE | name_id::LICENSE_URL => {
350            canonicalize_ofl_license_reference_urls(normalized)
351        }
352        _ => normalized,
353    }
354}
355
/// Cuts a "with (no) Reserved Font Name …" clause, matched ASCII
/// case-insensitively, and everything after it from `value`.
///
/// Markers are tried in a fixed order and the first marker that occurs
/// decides the cut position. The kept prefix then loses trailing
/// `,`, `;`, `:`, space, and `(` characters and surrounding whitespace.
/// A value without any marker is returned unchanged.
fn strip_reserved_font_name_clause(value: String) -> String {
    const CLAUSE_MARKERS: [&str; 4] = [
        ", with reserved font name",
        ", with no reserved font name",
        " with reserved font name",
        " with no reserved font name",
    ];

    // ASCII lowercasing keeps byte offsets aligned with `value`.
    let lowered = value.to_ascii_lowercase();
    let cut = CLAUSE_MARKERS
        .iter()
        .find_map(|marker| lowered.find(*marker));

    match cut {
        Some(index) => value[..index]
            .trim_end_matches(&[',', ';', ':', ' ', '('][..])
            .trim()
            .to_string(),
        None => value,
    }
}
374
375fn canonicalize_ofl_license_reference_urls(mut value: String) -> String {
376    for (from, to) in OFL_URL_CANONICALIZATIONS {
377        value = value.replace(from, to);
378    }
379    value
380}
381
382fn font_permission_label(permission: Permissions) -> &'static str {
383    match permission {
384        Permissions::Installable => "Installable",
385        Permissions::Restricted => "Restricted",
386        Permissions::PreviewAndPrint => "Preview and Print",
387        Permissions::Editable => "Editable",
388    }
389}
390
// Unit tests for font metadata extraction. Most of them read binary fixtures
// from `testdata/font-fixtures/` using paths relative to the working
// directory of the test run.
#[cfg(test)]
mod tests {
    use std::fs;
    use std::path::Path;

    use crate::copyright::detect_copyrights;
    use crate::license_detection::LicenseDetectionEngine;
    use ttf_parser::name_id;

    use super::{
        build_font_metadata_line, canonicalize_ofl_license_reference_urls,
        extract_font_metadata_text,
    };

    /// A TTF fixture yields a labelled license description that mentions the
    /// Open Font License.
    #[test]
    fn extracts_ofl_metadata_from_lato_font_fixture() {
        let bytes =
            fs::read("testdata/font-fixtures/Lato-Bold.ttf").expect("read lato font fixture");

        let text = extract_font_metadata_text(Path::new("Lato-Bold.ttf"), &bytes)
            .expect("font metadata text");

        assert!(text.contains("License Description:"), "{text}");
        assert!(
            text.contains("Open Font License") || text.contains("OFL"),
            "{text}"
        );
    }

    /// A TTF fixture yields copyright or license text that references Apache.
    #[test]
    fn extracts_apache_metadata_from_underline_test_font_fixture() {
        let bytes = fs::read("testdata/font-fixtures/UnderlineTest-Close.ttf")
            .expect("read apache font fixture");

        let text = extract_font_metadata_text(Path::new("UnderlineTest-Close.ttf"), &bytes)
            .expect("font metadata text");

        assert!(
            text.contains("License Description:") || text.contains("Copyright"),
            "{text}"
        );
        assert!(
            text.contains("Apache") || text.contains("http://www.apache.org/licenses"),
            "{text}"
        );
    }

    /// The `openfontlicense.org` URL with a trailing slash is rewritten to the
    /// canonical `scripts.sil.org` URL.
    #[test]
    fn canonicalizes_ofl_url_variants_in_font_license_metadata() {
        let canonical = canonicalize_ofl_license_reference_urls(
            "This license is available with a FAQ at: https://openfontlicense.org/".to_string(),
        );

        assert_eq!(
            canonical,
            "This license is available with a FAQ at: http://scripts.sil.org/OFL"
        );
    }

    /// Lines built from Noto-style name entries omit the trademark entry,
    /// keep the copyright notice, canonicalize the OFL URL, and are detected
    /// as OFL-1.1 with the expected copyright and holder.
    #[test]
    fn font_metadata_lines_detect_noto_ofl_text_without_trademark_noise() {
        let metadata_text = [
            build_font_metadata_line(
                name_id::COPYRIGHT_NOTICE,
                "Copyright 2022 The Noto Project Authors (https://github.com/notofonts/latin-greek-cyrillic)".to_string(),
            ),
            build_font_metadata_line(
                name_id::TRADEMARK,
                "Noto is a trademark of Google LLC.".to_string(),
            ),
            build_font_metadata_line(
                name_id::LICENSE,
                "This Font Software is licensed under the SIL Open Font License, Version 1.1. This license is available with a FAQ at: https://scripts.sil.org/OFL".to_string(),
            ),
            build_font_metadata_line(
                name_id::LICENSE_URL,
                "https://scripts.sil.org/OFL".to_string(),
            ),
        ]
        .into_iter()
        // `build_font_metadata_line` returns `None` for ids without a label
        // (the trademark entry here); `flatten` drops those.
        .flatten()
        .collect::<Vec<_>>()
        .join("\n");

        assert!(!metadata_text.contains("Trademark:"), "{metadata_text}");
        assert!(
            metadata_text.contains("Copyright 2022 The Noto Project Authors"),
            "{metadata_text}"
        );
        assert!(
            metadata_text.contains("http://scripts.sil.org/OFL"),
            "{metadata_text}"
        );

        let engine = LicenseDetectionEngine::from_embedded().expect("initialize license engine");
        let detections = engine
            .detect_with_kind_and_source_with_score(&metadata_text, false, false, "font.ttf", 0.0)
            .expect("detect licenses from font metadata text");

        assert!(
            detections.iter().any(|detection| {
                detection
                    .license_expression_spdx
                    .as_deref()
                    .is_some_and(|expression| expression.contains("OFL-1.1"))
            }),
            "detections: {detections:#?}"
        );

        let (copyrights, holders, _authors) = detect_copyrights(&metadata_text, None);
        assert!(
            copyrights.iter().any(|detection| {
                detection.copyright
                    == "Copyright 2022 The Noto Project Authors (https://github.com/notofonts/latin-greek-cyrillic)"
            }),
            "copyrights: {copyrights:#?}"
        );
        assert!(
            holders
                .iter()
                .any(|detection| detection.holder == "The Noto Project Authors"),
            "holders: {holders:#?}"
        );
    }

    /// A WOFF fixture yields Adobe copyright text and the canonical OFL URL.
    #[test]
    fn extracts_metadata_from_sourcecodepro_woff_fixture() {
        let bytes = fs::read("testdata/font-fixtures/SourceCodePro-Regular.otf.woff")
            .expect("read woff font fixture");

        let text = extract_font_metadata_text(Path::new("SourceCodePro-Regular.otf.woff"), &bytes)
            .expect("woff font metadata text");

        assert!(text.contains("Adobe"), "{text}");
        assert!(
            text.contains("Open Font License") || text.contains("OFL"),
            "{text}"
        );
        assert!(text.contains("http://scripts.sil.org/OFL"), "{text}");
    }

    /// A WOFF2 fixture yields Adobe copyright text and the canonical OFL URL.
    #[test]
    fn extracts_metadata_from_sourcecodepro_woff2_fixture() {
        let bytes = fs::read("testdata/font-fixtures/SourceCodePro-Regular.otf.woff2")
            .expect("read woff2 font fixture");

        let text = extract_font_metadata_text(Path::new("SourceCodePro-Regular.otf.woff2"), &bytes)
            .expect("woff2 font metadata text");

        assert!(text.contains("Adobe"), "{text}");
        assert!(
            text.contains("Open Font License") || text.contains("OFL"),
            "{text}"
        );
        assert!(text.contains("http://scripts.sil.org/OFL"), "{text}");
    }

    /// An EOT fixture yields the copyright notice, the license statement, and
    /// the canonical OFL URL.
    #[test]
    fn extracts_legal_strings_from_notosans_eot_fixture() {
        let bytes =
            fs::read("testdata/font-fixtures/NotoSans-Regular.eot").expect("read eot font fixture");

        let text = extract_font_metadata_text(Path::new("NotoSans-Regular.eot"), &bytes)
            .expect("eot font metadata text");

        assert!(text.contains("Copyright 2015 Google Inc."), "{text}");
        assert!(
            text.contains("This Font Software is licensed under the SIL Open Font License"),
            "{text}"
        );
        assert!(text.contains("http://scripts.sil.org/OFL"), "{text}");
    }

    /// Text extracted from the WOFF fixture is detected as OFL-1.1, and the
    /// detected copyright statement does not carry a reserved-font-name tail.
    #[test]
    fn wrapped_font_metadata_detects_sourcecodepro_ofl_without_reserved_font_tail() {
        let bytes = fs::read("testdata/font-fixtures/SourceCodePro-Regular.otf.woff")
            .expect("read woff font fixture");
        let metadata_text =
            extract_font_metadata_text(Path::new("SourceCodePro-Regular.otf.woff"), &bytes)
                .expect("wrapped font metadata text");

        let engine = LicenseDetectionEngine::from_embedded().expect("initialize license engine");
        let detections = engine
            .detect_with_kind_and_source_with_score(&metadata_text, false, false, "font.woff", 0.0)
            .expect("detect licenses from wrapped font metadata text");
        assert!(
            detections.iter().any(|detection| {
                detection
                    .license_expression_spdx
                    .as_deref()
                    .is_some_and(|expression| expression.contains("OFL-1.1"))
            }),
            "detections: {detections:#?}"
        );

        let (copyrights, holders, _authors) = detect_copyrights(&metadata_text, None);
        assert!(
            copyrights.iter().any(|detection| {
                detection.copyright == "(c) 2023 Adobe (http://www.adobe.com/)"
            }),
            "copyrights: {copyrights:#?}"
        );
        assert!(
            holders.iter().any(|detection| detection.holder == "Adobe"),
            "holders: {holders:#?}"
        );
    }

    /// A TTC collection fixture yields copyright or license text, including
    /// the phrase "No rights reserved".
    #[test]
    fn extracts_metadata_from_ttc_fixture() {
        let bytes = fs::read("testdata/font-fixtures/TTC.ttc").expect("read ttc font fixture");

        let text = extract_font_metadata_text(Path::new("TTC.ttc"), &bytes)
            .expect("ttc font metadata text");

        assert!(
            text.contains("Copyright") || text.contains("License"),
            "{text}"
        );
        assert!(text.contains("No rights reserved"), "{text}");
    }
}