Skip to main content

provenant/utils/
font.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::collections::BTreeSet;
5use std::path::Path;
6
7use allsorts::binary::read::ReadScope;
8use allsorts::font_data::FontData;
9use allsorts::tables::{FontTableProvider, NameTable, OpenTypeData};
10use ttf_parser::{Face, Permissions, fonts_in_collection, name_id};
11
12use crate::parsers::metadata::ParserMetadata;
13
/// File extensions (lowercase, without the leading dot) recognised as font files.
pub(crate) const SUPPORTED_FONT_EXTENSIONS: &[&str] = &[
    "ttf",
    "otf",
    "woff",
    "woff2",
    "eot",
    "ttc",
    "otc",
];

/// Recursive glob patterns, one per entry of [`SUPPORTED_FONT_EXTENSIONS`].
pub(crate) const SUPPORTED_FONT_FILE_GLOBS: &[&str] = &[
    "**/*.ttf",
    "**/*.otf",
    "**/*.woff",
    "**/*.woff2",
    "**/*.eot",
    "**/*.ttc",
    "**/*.otc",
];

/// The single URL every known OFL reference variant is rewritten to.
const CANONICAL_OFL_URL: &str = "http://scripts.sil.org/OFL";

/// `(variant, canonical)` replacement pairs applied in order.
///
/// Each trailing-slash variant is listed before its slash-less prefix so the
/// slash is consumed by the first replacement rather than left behind.
const OFL_URL_CANONICALIZATIONS: &[(&str, &str)] = &[
    ("https://scripts.sil.org/OFL/", CANONICAL_OFL_URL),
    ("https://scripts.sil.org/OFL", CANONICAL_OFL_URL),
    ("https://openfontlicense.org/", CANONICAL_OFL_URL),
    ("https://openfontlicense.org", CANONICAL_OFL_URL),
];

/// Big-endian packing of the four ASCII bytes `name`, used as the table tag
/// when asking allsorts for the raw `name` table.
const ALLSORTS_NAME_TABLE_TAG: u32 = u32::from_be_bytes([b'n', b'a', b'm', b'e']);
32
33pub(crate) static FONT_METADATA: &[ParserMetadata] = &[ParserMetadata {
34    description: "Embedded font legal metadata (native fonts, webfonts, and collections)",
35    file_patterns: SUPPORTED_FONT_FILE_GLOBS,
36    package_type: "",
37    primary_language: "",
38    documentation_url: Some("https://learn.microsoft.com/en-us/typography/opentype/spec/name"),
39}];
40
41pub(crate) fn is_supported_font_extension(extension: &str) -> bool {
42    SUPPORTED_FONT_EXTENSIONS
43        .iter()
44        .any(|supported| supported.eq_ignore_ascii_case(extension))
45}
46
47pub(crate) fn is_supported_font_path(path: &Path) -> bool {
48    path.extension()
49        .and_then(|ext| ext.to_str())
50        .is_some_and(is_supported_font_extension)
51}
52
53pub(crate) fn extract_font_metadata_text(path: &Path, bytes: &[u8]) -> Option<String> {
54    let extension = path.extension().and_then(|ext| ext.to_str())?;
55    let extension = extension.to_ascii_lowercase();
56    if !is_supported_font_extension(&extension) {
57        return None;
58    }
59
60    match extension.as_str() {
61        "ttf" | "otf" | "woff" | "woff2" | "ttc" | "otc" => extract_sfnt_font_metadata_text(
62            bytes,
63            matches!(extension.as_str(), "ttf" | "otf" | "ttc" | "otc"),
64        ),
65        "eot" => extract_eot_metadata_text(bytes),
66        _ => None,
67    }
68}
69
70fn extract_sfnt_font_metadata_text(bytes: &[u8], include_permissions: bool) -> Option<String> {
71    let mut lines = Vec::new();
72    let mut seen = BTreeSet::new();
73
74    for line in extract_allsorts_name_table_lines(bytes) {
75        if seen.insert(line.clone()) {
76            lines.push(line);
77        }
78    }
79
80    if include_permissions {
81        let face_count = fonts_in_collection(bytes).unwrap_or(1);
82        for face_index in 0..face_count {
83            let Some(permissions) = Face::parse(bytes, face_index).ok()?.permissions() else {
84                continue;
85            };
86            let line = format!(
87                "Embedding permissions: {}",
88                font_permission_label(permissions)
89            );
90            if seen.insert(line.clone()) {
91                lines.push(line);
92            }
93        }
94    }
95
96    (!lines.is_empty()).then(|| lines.join("\n"))
97}
98
99fn extract_allsorts_name_table_lines(bytes: &[u8]) -> Vec<String> {
100    let Some(font_data) = ReadScope::new(bytes).read::<FontData<'_>>().ok() else {
101        return Vec::new();
102    };
103
104    let mut lines = Vec::new();
105    let mut seen = BTreeSet::new();
106    for face_index in 0..allsorts_face_count(&font_data) {
107        let Ok(provider) = font_data.table_provider(face_index) else {
108            continue;
109        };
110        let Ok(name_table_data) = provider.read_table_data(ALLSORTS_NAME_TABLE_TAG) else {
111            continue;
112        };
113        let Ok(name_table) = ReadScope::new(name_table_data.as_ref()).read::<NameTable<'_>>()
114        else {
115            continue;
116        };
117
118        for (source_name_id, target_name_id) in [
119            (NameTable::COPYRIGHT_NOTICE, name_id::COPYRIGHT_NOTICE),
120            (NameTable::LICENSE_DESCRIPTION, name_id::LICENSE),
121            (NameTable::LICENSE_INFO_URL, name_id::LICENSE_URL),
122        ] {
123            let Some(value) = name_table.string_for_id(source_name_id) else {
124                continue;
125            };
126            let Some(line) = build_font_metadata_line(target_name_id, value) else {
127                continue;
128            };
129            if seen.insert(line.clone()) {
130                lines.push(line);
131            }
132        }
133    }
134
135    lines
136}
137
138fn allsorts_face_count(font_data: &FontData<'_>) -> usize {
139    match font_data {
140        FontData::OpenType(font) => match &font.data {
141            OpenTypeData::Single(_) => 1,
142            OpenTypeData::Collection(ttc) => ttc.offset_tables.len(),
143        },
144        FontData::Woff(_) => 1,
145        FontData::Woff2(font) => font
146            .collection_directory
147            .as_ref()
148            .map(|directory| directory.fonts().count())
149            .unwrap_or(1),
150    }
151}
152
153fn extract_eot_metadata_text(bytes: &[u8]) -> Option<String> {
154    let text = extract_eot_utf16le_marker_text(bytes).join("\n");
155    if text.is_empty() {
156        return None;
157    }
158
159    let mut lines = Vec::new();
160    let mut seen = BTreeSet::new();
161    for segment in split_eot_legal_metadata_segments(&text) {
162        let normalized = normalize_eot_metadata_segment(&segment);
163        if normalized.is_empty() {
164            continue;
165        }
166        if seen.insert(normalized.clone()) {
167            lines.push(normalized);
168        }
169    }
170
171    (!lines.is_empty()).then(|| lines.join("\n"))
172}
173
174fn extract_eot_utf16le_marker_text(bytes: &[u8]) -> Vec<String> {
175    let mut lines = Vec::new();
176    let mut seen = BTreeSet::new();
177    for marker in [
178        "Copyright",
179        "This Font Software is licensed under",
180        "http://",
181        "https://",
182    ] {
183        let encoded = marker.encode_utf16().collect::<Vec<_>>();
184        let marker_bytes = encoded
185            .iter()
186            .flat_map(|unit| unit.to_le_bytes())
187            .collect::<Vec<_>>();
188        let mut search_start = 0;
189        while let Some(relative_start) = bytes[search_start..]
190            .windows(marker_bytes.len())
191            .position(|window| window == marker_bytes.as_slice())
192        {
193            let start = search_start + relative_start;
194            let decoded = decode_utf16le_ascii_from_offset(bytes, start);
195            if !decoded.is_empty() && seen.insert(decoded.clone()) {
196                lines.push(decoded);
197            }
198            search_start = start + marker_bytes.len();
199        }
200    }
201    lines
202}
203
/// Decodes consecutive UTF-16LE code units starting at byte offset `start`
/// for as long as each unit is printable ASCII (`0x20..=0x7E` with a zero
/// high byte).
///
/// Decoding stops at the first other unit or when fewer than two bytes
/// remain. An offset at or past the end of `bytes` yields an empty string.
fn decode_utf16le_ascii_from_offset(bytes: &[u8], start: usize) -> String {
    bytes
        .get(start..)
        .unwrap_or(&[])
        .chunks_exact(2)
        .take_while(|unit| unit[1] == 0 && matches!(unit[0], 0x20..=0x7E))
        .map(|unit| char::from(unit[0]))
        .collect()
}
219
220fn split_eot_legal_metadata_segments(text: &str) -> Vec<String> {
221    let mut segments = Vec::new();
222
223    if let Some(segment) = extract_text_between_markers(
224        text,
225        "Copyright",
226        &["All Rights Reserved.", "All rights reserved."],
227    ) {
228        segments.push(segment);
229    }
230    if let Some(segment) = extract_text_between_markers(
231        text,
232        "This Font Software is licensed under",
233        &[
234            "governing your use of this Font Software.",
235            "This Font Software.",
236        ],
237    ) {
238        segments.push(segment);
239    }
240    segments.extend(extract_http_segments(text));
241
242    segments
243}
244
/// Returns the slice of `text` that starts at the first `start_marker` and
/// runs through the end of whichever `end_markers` entry finishes earliest
/// after that point.
///
/// When no end marker follows the start marker the rest of `text` is
/// returned. `None` means `start_marker` does not occur at all.
fn extract_text_between_markers(
    text: &str,
    start_marker: &str,
    end_markers: &[&str],
) -> Option<String> {
    let begin = text.find(start_marker)?;
    let remainder = &text[begin..];

    let mut cut = remainder.len();
    for end_marker in end_markers {
        if let Some(position) = remainder.find(*end_marker) {
            cut = cut.min(position + end_marker.len());
        }
    }

    Some(remainder[..cut].to_owned())
}
259
/// Extracts URL-like segments from `text`.
///
/// All `http://` segments are returned first, followed by all `https://`
/// segments. A segment ends at the first whitespace or at the next occurrence
/// of a stop phrase (another scheme, `This Font Software`, `Copyright`,
/// `Version `), whichever comes first; trailing `.`, `,`, `;` and `:` are
/// then trimmed.
fn extract_http_segments(text: &str) -> Vec<String> {
    const SCHEMES: [&str; 2] = ["http://", "https://"];
    const STOP_PHRASES: [&str; 5] = [
        "http://",
        "https://",
        "This Font Software",
        "Copyright",
        "Version ",
    ];
    const TRAILING_PUNCTUATION: &[char] = &['.', ',', ';', ':'];

    let mut urls = Vec::new();
    for scheme in SCHEMES {
        let mut cursor = 0;
        while let Some(found) = text[cursor..].find(scheme) {
            let url_start = cursor + found;
            let body_start = url_start + scheme.len();
            let body = &text[body_start..];

            // Length of the part after the scheme: up to the nearest stop
            // phrase or whitespace, or the whole remainder when none occurs.
            let body_len = STOP_PHRASES
                .iter()
                .filter_map(|phrase| body.find(*phrase))
                .chain(body.find(char::is_whitespace))
                .min()
                .unwrap_or(body.len());
            let url_end = body_start + body_len;

            let url = text[url_start..url_end].trim_end_matches(TRAILING_PUNCTUATION);
            if !url.is_empty() {
                urls.push(url.to_string());
            }
            cursor = url_end;
        }
    }
    urls
}
294
295fn normalize_eot_metadata_segment(segment: &str) -> String {
296    let normalized = segment
297        .split_whitespace()
298        .collect::<Vec<_>>()
299        .join(" ")
300        .trim()
301        .to_string();
302
303    if normalized.is_empty() {
304        return normalized;
305    }
306
307    let lowered = normalized.to_ascii_lowercase();
308    if lowered.starts_with("http://") || lowered.starts_with("https://") {
309        return canonicalize_ofl_license_reference_urls(normalized);
310    }
311
312    if lowered.contains("font software") || lowered.contains("open font license") {
313        return canonicalize_ofl_license_reference_urls(normalized);
314    }
315
316    normalized
317}
318
319fn build_font_metadata_line(name_id_value: u16, value: String) -> Option<String> {
320    let value = normalize_font_value(name_id_value, value);
321    if value.is_empty() {
322        return None;
323    }
324
325    if name_id_value == name_id::COPYRIGHT_NOTICE {
326        return Some(value);
327    }
328
329    let label = font_name_label(name_id_value)?;
330    Some(format!("{label}: {value}"))
331}
332
333fn font_name_label(name_id_value: u16) -> Option<&'static str> {
334    match name_id_value {
335        name_id::LICENSE => Some("License Description"),
336        name_id::LICENSE_URL => Some("License Info URL"),
337        _ => None,
338    }
339}
340
341fn normalize_font_value(name_id_value: u16, value: String) -> String {
342    let normalized = value
343        .split_whitespace()
344        .collect::<Vec<_>>()
345        .join(" ")
346        .trim()
347        .to_string();
348
349    match name_id_value {
350        name_id::COPYRIGHT_NOTICE => strip_reserved_font_name_clause(normalized),
351        name_id::LICENSE | name_id::LICENSE_URL => {
352            canonicalize_ofl_license_reference_urls(normalized)
353        }
354        _ => normalized,
355    }
356}
357
/// Cuts a copyright notice at its "with (no) reserved font name" clause.
///
/// Markers are tried in a fixed order (ASCII case-insensitive) and the first
/// marker that occurs anywhere decides the cut position. Trailing `,`, `;`,
/// `:`, spaces and `(` left before the cut are removed and the result is
/// trimmed. Values without such a clause are returned unchanged.
fn strip_reserved_font_name_clause(value: String) -> String {
    const CLAUSE_MARKERS: [&str; 4] = [
        ", with reserved font name",
        ", with no reserved font name",
        " with reserved font name",
        " with no reserved font name",
    ];
    const TRAILING_PUNCTUATION: &[char] = &[',', ';', ':', ' ', '('];

    // ASCII lowercasing keeps byte offsets aligned with `value`.
    let folded = value.to_ascii_lowercase();
    let cut = CLAUSE_MARKERS
        .iter()
        .find_map(|marker| folded.find(*marker));

    match cut {
        Some(index) => value[..index]
            .trim_end_matches(TRAILING_PUNCTUATION)
            .trim()
            .to_string(),
        None => value,
    }
}
376
377fn canonicalize_ofl_license_reference_urls(mut value: String) -> String {
378    for (from, to) in OFL_URL_CANONICALIZATIONS {
379        value = value.replace(from, to);
380    }
381    value
382}
383
384fn font_permission_label(permission: Permissions) -> &'static str {
385    match permission {
386        Permissions::Installable => "Installable",
387        Permissions::Restricted => "Restricted",
388        Permissions::PreviewAndPrint => "Preview and Print",
389        Permissions::Editable => "Editable",
390    }
391}
392
#[cfg(test)]
mod tests {
    use std::fs;
    use std::path::Path;

    use crate::copyright::detect_copyrights;
    use crate::license_detection::LicenseDetectionEngine;
    use ttf_parser::name_id;

    use super::{
        build_font_metadata_line, canonicalize_ofl_license_reference_urls,
        extract_font_metadata_text,
    };

    /// Directory holding the binary font fixtures, relative to the test working directory.
    const FIXTURE_DIR: &str = "testdata/font-fixtures";

    /// Reads a fixture and extracts its metadata text, panicking with the
    /// supplied messages when either step fails.
    fn fixture_metadata_text(file_name: &str, read_failure: &str, extract_failure: &str) -> String {
        let bytes = fs::read(format!("{FIXTURE_DIR}/{file_name}")).expect(read_failure);
        extract_font_metadata_text(Path::new(file_name), &bytes).expect(extract_failure)
    }

    /// True when the text names the Open Font License in long or short form.
    fn mentions_ofl(text: &str) -> bool {
        text.contains("Open Font License") || text.contains("OFL")
    }

    /// Runs license detection over `metadata_text` and asserts that at least
    /// one detection carries an SPDX expression containing `OFL-1.1`.
    fn assert_detects_ofl_1_1(metadata_text: &str, source: &str, detect_failure: &str) {
        let engine = LicenseDetectionEngine::from_embedded().expect("initialize license engine");
        let detections = engine
            .detect_with_kind_and_source_with_score(metadata_text, false, false, source, 0.0)
            .expect(detect_failure);

        assert!(
            detections.iter().any(|detection| {
                detection
                    .license_expression_spdx
                    .as_deref()
                    .is_some_and(|expression| expression.contains("OFL-1.1"))
            }),
            "detections: {detections:#?}"
        );
    }

    #[test]
    fn extracts_ofl_metadata_from_lato_font_fixture() {
        let text = fixture_metadata_text(
            "Lato-Bold.ttf",
            "read lato font fixture",
            "font metadata text",
        );

        assert!(text.contains("License Description:"), "{text}");
        assert!(mentions_ofl(&text), "{text}");
    }

    #[test]
    fn extracts_apache_metadata_from_underline_test_font_fixture() {
        let text = fixture_metadata_text(
            "UnderlineTest-Close.ttf",
            "read apache font fixture",
            "font metadata text",
        );

        assert!(
            text.contains("License Description:") || text.contains("Copyright"),
            "{text}"
        );
        assert!(
            text.contains("Apache") || text.contains("http://www.apache.org/licenses"),
            "{text}"
        );
    }

    #[test]
    fn canonicalizes_ofl_url_variants_in_font_license_metadata() {
        let input = "This license is available with a FAQ at: https://openfontlicense.org/";

        let canonical = canonicalize_ofl_license_reference_urls(input.to_string());

        assert_eq!(
            canonical,
            "This license is available with a FAQ at: http://scripts.sil.org/OFL"
        );
    }

    #[test]
    fn font_metadata_lines_detect_noto_ofl_text_without_trademark_noise() {
        let name_records = [
            (
                name_id::COPYRIGHT_NOTICE,
                "Copyright 2022 The Noto Project Authors (https://github.com/notofonts/latin-greek-cyrillic)",
            ),
            (name_id::TRADEMARK, "Noto is a trademark of Google LLC."),
            (
                name_id::LICENSE,
                "This Font Software is licensed under the SIL Open Font License, Version 1.1. This license is available with a FAQ at: https://scripts.sil.org/OFL",
            ),
            (name_id::LICENSE_URL, "https://scripts.sil.org/OFL"),
        ];
        let metadata_text = name_records
            .iter()
            .filter_map(|&(id, raw)| build_font_metadata_line(id, raw.to_string()))
            .collect::<Vec<_>>()
            .join("\n");

        assert!(!metadata_text.contains("Trademark:"), "{metadata_text}");
        assert!(
            metadata_text.contains("Copyright 2022 The Noto Project Authors"),
            "{metadata_text}"
        );
        assert!(
            metadata_text.contains("http://scripts.sil.org/OFL"),
            "{metadata_text}"
        );

        assert_detects_ofl_1_1(
            &metadata_text,
            "font.ttf",
            "detect licenses from font metadata text",
        );

        let (copyrights, holders, _authors) = detect_copyrights(&metadata_text, None);
        assert!(
            copyrights.iter().any(|detection| {
                detection.copyright
                    == "Copyright 2022 The Noto Project Authors (https://github.com/notofonts/latin-greek-cyrillic)"
            }),
            "copyrights: {copyrights:#?}"
        );
        assert!(
            holders
                .iter()
                .any(|detection| detection.holder == "The Noto Project Authors"),
            "holders: {holders:#?}"
        );
    }

    #[test]
    fn extracts_metadata_from_sourcecodepro_woff_fixture() {
        let text = fixture_metadata_text(
            "SourceCodePro-Regular.otf.woff",
            "read woff font fixture",
            "woff font metadata text",
        );

        assert!(text.contains("Adobe"), "{text}");
        assert!(mentions_ofl(&text), "{text}");
        assert!(text.contains("http://scripts.sil.org/OFL"), "{text}");
    }

    #[test]
    fn extracts_metadata_from_sourcecodepro_woff2_fixture() {
        let text = fixture_metadata_text(
            "SourceCodePro-Regular.otf.woff2",
            "read woff2 font fixture",
            "woff2 font metadata text",
        );

        assert!(text.contains("Adobe"), "{text}");
        assert!(mentions_ofl(&text), "{text}");
        assert!(text.contains("http://scripts.sil.org/OFL"), "{text}");
    }

    #[test]
    fn extracts_legal_strings_from_notosans_eot_fixture() {
        let text = fixture_metadata_text(
            "NotoSans-Regular.eot",
            "read eot font fixture",
            "eot font metadata text",
        );

        assert!(text.contains("Copyright 2015 Google Inc."), "{text}");
        assert!(
            text.contains("This Font Software is licensed under the SIL Open Font License"),
            "{text}"
        );
        assert!(text.contains("http://scripts.sil.org/OFL"), "{text}");
    }

    #[test]
    fn wrapped_font_metadata_detects_sourcecodepro_ofl_without_reserved_font_tail() {
        let metadata_text = fixture_metadata_text(
            "SourceCodePro-Regular.otf.woff",
            "read woff font fixture",
            "wrapped font metadata text",
        );

        assert_detects_ofl_1_1(
            &metadata_text,
            "font.woff",
            "detect licenses from wrapped font metadata text",
        );

        let (copyrights, holders, _authors) = detect_copyrights(&metadata_text, None);
        assert!(
            copyrights.iter().any(|detection| {
                detection.copyright == "(c) 2023 Adobe (http://www.adobe.com/)"
            }),
            "copyrights: {copyrights:#?}"
        );
        assert!(
            holders.iter().any(|detection| detection.holder == "Adobe"),
            "holders: {holders:#?}"
        );
    }

    #[test]
    fn extracts_metadata_from_ttc_fixture() {
        let text = fixture_metadata_text(
            "TTC.ttc",
            "read ttc font fixture",
            "ttc font metadata text",
        );

        assert!(
            text.contains("Copyright") || text.contains("License"),
            "{text}"
        );
        assert!(text.contains("No rights reserved"), "{text}");
    }
}