Skip to main content

typg_core/
search.rs

1//! Font metadata extraction and search.
2//!
3//! This module opens font files, extracts searchable metadata from their
4//! OpenType tables, and evaluates that metadata against a [`Query`]. The same
5//! metadata model is used by live scans, cache files, and indexed search.
6//!
7//! One file may yield multiple results because collection formats such as TTC
8//! and OTC can store several faces in a single container.
9//!
10//! Made by FontLab <https://www.fontlab.com/>
11use std::fs;
12use std::path::{Path, PathBuf};
13use std::sync::mpsc::Sender;
14
15use anyhow::{Context, Result};
16use rayon::prelude::*;
17use rayon::ThreadPoolBuilder;
18use read_fonts::tables::name::NameId;
19use read_fonts::types::Tag;
20use read_fonts::{FontRef, TableProvider};
21use serde::{Deserialize, Serialize};
22use skrifa::{FontRef as SkrifaFontRef, MetadataProvider};
23
24use crate::discovery::{FontDiscovery, PathDiscovery};
25use crate::query::Query;
26use crate::tags::{tag4, tag_to_string};
27
28/// Everything we know about a single font face, extracted from its binary tables.
29///
30/// One font *file* may contain multiple faces (in a TTC/OTC collection), and
31/// each face gets its own `TypgFontFaceMeta`. This struct is the unit of
32/// comparison — every query filter is evaluated against one of these.
33///
34/// All tag vectors are sorted and deduplicated after extraction, so you can
35/// safely use set-intersection logic against them.
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct TypgFontFaceMeta {
38    /// Human-readable names for this font face.
39    ///
40    /// Collected from the font's `name` table: family name ("Helvetica"),
41    /// typographic family name ("Helvetica Neue"), full name ("Helvetica Neue
42    /// Bold"), PostScript name ("HelveticaNeue-Bold"), and subfamily
43    /// ("Bold"). The file stem (e.g., "HelveticaNeue-Bold" from the filename)
44    /// is always appended as a fallback, because some fonts have empty or
45    /// broken name tables.
46    ///
47    /// Sorted, deduplicated, trimmed of whitespace.
48    pub names: Vec<String>,
49
50    /// Variation axis tags. Empty for static (non-variable) fonts.
51    ///
52    /// Common axes: `wght` (weight: 100=Thin, 400=Regular, 700=Bold, 900=Black),
53    /// `wdth` (width: 75=Condensed, 100=Normal, 125=Expanded),
54    /// `opsz` (optical size: adjusts stroke contrast for small/large rendering),
55    /// `ital` (italic: 0=Upright, 1=Italic),
56    /// `slnt` (slant: oblique angle in degrees).
57    ///
58    /// Read from the font's `fvar` (font variations) table.
59    #[serde(
60        serialize_with = "serialize_tags",
61        deserialize_with = "deserialize_tags"
62    )]
63    pub axis_tags: Vec<Tag>,
64
65    /// OpenType layout feature tags from GSUB and GPOS tables.
66    ///
67    /// These control typographic behavior: `liga` (standard ligatures — fi, fl
68    /// become single glyphs), `smcp` (small capitals), `onum` (oldstyle
69    /// numerals), `kern` (kerning — fine-tuned spacing between specific letter
70    /// pairs), `calt` (contextual alternates), `dlig` (discretionary ligatures).
71    ///
72    /// GSUB features handle glyph *substitution* (replacing one glyph with
73    /// another). GPOS features handle glyph *positioning* (adjusting placement).
74    /// Both are merged here because the query doesn't distinguish them.
75    #[serde(
76        serialize_with = "serialize_tags",
77        deserialize_with = "deserialize_tags"
78    )]
79    pub feature_tags: Vec<Tag>,
80
81    /// Script tags declaring which writing systems this font supports.
82    ///
83    /// Read from GSUB and GPOS script lists. Common values: `latn` (Latin),
84    /// `arab` (Arabic), `cyrl` (Cyrillic), `grek` (Greek), `hani` (CJK
85    /// ideographs), `deva` (Devanagari), `thai` (Thai).
86    ///
87    /// A font can render characters from a script's Unicode range without
88    /// declaring script support here — the script tag means the font has
89    /// *shaping rules* (substitutions, positioning) specifically for that
90    /// writing system.
91    #[serde(
92        serialize_with = "serialize_tags",
93        deserialize_with = "deserialize_tags"
94    )]
95    pub script_tags: Vec<Tag>,
96
97    /// Every top-level table present in the font file.
98    ///
99    /// Useful for structural queries: does this font have `CFF ` (PostScript
100    /// outlines) or `glyf` (TrueType outlines)? Does it have `SVG ` (color
101    /// SVG glyphs) or `COLR` (color layer glyphs)? Does it have `fvar`
102    /// (variable font axes)?
103    ///
104    /// Read directly from the font's table directory — the index at the
105    /// start of every OpenType file.
106    #[serde(
107        serialize_with = "serialize_tags",
108        deserialize_with = "deserialize_tags"
109    )]
110    pub table_tags: Vec<Tag>,
111
112    /// Unicode codepoints this font can render, from its `cmap` table.
113    ///
114    /// The `cmap` (character map) is the font's promise: "give me this
115    /// Unicode codepoint, I'll give you a glyph." If U+00F1 (ñ) is in
116    /// this list, the font has a glyph for it.
117    ///
118    /// Sorted and deduplicated. Can be large — a CJK font may cover
119    /// 20,000+ codepoints.
120    pub codepoints: Vec<char>,
121
122    /// Whether this font has an `fvar` table, making it a variable font.
123    ///
124    /// Variable fonts contain continuous design axes (weight, width, etc.)
125    /// instead of discrete named instances. A single variable font file can
126    /// replace an entire family of static fonts.
127    pub is_variable: bool,
128
129    /// OS/2 `usWeightClass` value. Indicates visual weight on a 1–1000 scale.
130    ///
131    /// Standard values: 100=Thin, 200=ExtraLight, 300=Light, 400=Regular,
132    /// 500=Medium, 600=SemiBold, 700=Bold, 800=ExtraBold, 900=Black.
133    /// `None` if the font has no OS/2 table (rare in modern fonts).
134    #[serde(default)]
135    pub weight_class: Option<u16>,
136
137    /// OS/2 `usWidthClass` value. Indicates visual width on a 1–9 scale.
138    ///
139    /// Values: 1=UltraCondensed, 2=ExtraCondensed, 3=Condensed,
140    /// 4=SemiCondensed, 5=Normal, 6=SemiExpanded, 7=Expanded,
141    /// 8=ExtraExpanded, 9=UltraExpanded.
142    /// `None` if the font has no OS/2 table.
143    #[serde(default)]
144    pub width_class: Option<u16>,
145
146    /// OS/2 `sFamilyClass` split into (major class, subclass).
147    ///
148    /// The major class groups fonts by general style: 0=No classification,
149    /// 1=Oldstyle Serifs, 2=Transitional Serifs, 3=Modern Serifs,
150    /// 4=Clarendon Serifs, 5=Slab Serifs, 7=Freeform Serifs,
151    /// 8=Sans Serif, 9=Ornamentals, 10=Scripts, 12=Symbolic.
152    ///
153    /// The subclass provides finer detail within each major class.
154    /// For example, within Sans Serif (8): 1=IBM Neo-Grotesque Gothic,
155    /// 2=Humanist, 3=Low-x Round Geometric, etc.
156    ///
157    /// `None` if the font has no OS/2 table.
158    #[serde(default)]
159    pub family_class: Option<(u8, u8)>,
160
161    /// Creator and provenance strings from the font's name table.
162    ///
163    /// Includes: copyright notice (name ID 0), trademark (7), manufacturer
164    /// (8), designer (9), description (10), vendor URL (11), designer URL
165    /// (12), license description (13), license URL (14).
166    ///
167    /// Useful for searching by foundry ("Adobe"), designer ("Matthew Carter"),
168    /// or license type ("OFL").
169    #[serde(default)]
170    pub creator_names: Vec<String>,
171
172    /// License-specific strings from the font's name table.
173    ///
174    /// A subset of creator info focused on licensing: copyright notice
175    /// (name ID 0), license description (13), license info URL (14).
176    ///
177    /// Useful for compliance checks: "show me all fonts with an SIL Open
178    /// Font License" or "find fonts with no license URL."
179    #[serde(default)]
180    pub license_names: Vec<String>,
181
182    /// PostScript Name (ID 6)
183    #[serde(default)]
184    pub psname: Option<String>,
185
186    /// Typographic Family Name (ID 16)
187    #[serde(default)]
188    pub tfname: Option<String>,
189
190    /// Legacy Family Name (ID 1)
191    #[serde(default)]
192    pub lfname: Option<String>,
193
194    /// Typographic Subfamily Name (ID 17)
195    #[serde(default)]
196    pub tsname: Option<String>,
197
198    /// Legacy Subfamily Name (ID 2)
199    #[serde(default)]
200    pub lsname: Option<String>,
201}
202
203/// Where a font face lives on disk.
204///
205/// For standalone `.ttf`/`.otf` files, the path is enough. For collection
206/// files (`.ttc`/`.otc`) that bundle multiple faces, the `ttc_index`
207/// identifies which face inside the collection this refers to.
208#[derive(Debug, Clone, Serialize, Deserialize)]
209pub struct TypgFontSource {
210    /// Filesystem path to the font file.
211    pub path: PathBuf,
212    /// Face index within a TTC/OTC collection file.
213    ///
214    /// `None` for single-face files (`.ttf`, `.otf`).
215    /// `Some(0)`, `Some(1)`, etc. for faces inside a collection.
216    /// For example, a `.ttc` containing "Arial" and "Arial Bold" would have
217    /// indices 0 and 1.
218    pub ttc_index: Option<u32>,
219}
220
221impl TypgFontSource {
222    /// Format as `path#index` for collection members, plain path otherwise.
223    ///
224    /// Examples: `/fonts/Noto.ttc#0`, `/fonts/Noto.ttc#1`, `/fonts/Arial.ttf`.
225    /// This notation is a common convention across font tools.
226    pub fn path_with_index(&self) -> String {
227        if let Some(idx) = self.ttc_index {
228            format!("{}#{idx}", self.path.display())
229        } else {
230            self.path.display().to_string()
231        }
232    }
233}
234
235/// A search result: one font face that matched the query.
236///
237/// Pairs the file location ([`TypgFontSource`]) with everything we extracted
238/// from the font's binary tables ([`TypgFontFaceMeta`]). This is the primary
239/// output type of the search engine — what you iterate over to display results,
240/// build caches, or pipe into downstream tools.
241#[derive(Debug, Clone, Serialize, Deserialize)]
242pub struct TypgFontFaceMatch {
243    /// Where the font lives: file path and optional TTC/OTC face index.
244    pub source: TypgFontSource,
245    /// Metadata extracted from the font's internal tables.
246    pub metadata: TypgFontFaceMeta,
247}
248
/// Execution settings for a search: directory traversal and parallelism.
#[derive(Debug, Default, Clone)]
pub struct SearchOptions {
    /// Whether directory walking follows symbolic links.
    ///
    /// Defaults to `false` so that circular symlinks cannot cause endless
    /// traversal. Enable it when font directories contain symlinks to the
    /// real font folders (common on macOS and Linux).
    pub follow_symlinks: bool,

    /// How many worker threads parse fonts in parallel.
    ///
    /// `None` (the default) runs on rayon's default thread pool, which uses
    /// all available CPU cores. `Some(1)` gives single-threaded operation,
    /// which helps when debugging or in constrained environments.
    pub jobs: Option<usize>,
}
266
267/// Search directories for fonts matching a query. The main entry point.
268///
269/// Walks the given directories, opens every font file found, extracts
270/// metadata, filters against the query, and returns all matches sorted by
271/// path (then by TTC index within each path).
272///
273/// This function collects all results in memory before returning — use
274/// [`search_streaming`] if you want results delivered as they're found
275/// (better for CLI output where users want to see progress immediately).
276///
277/// Corrupt or unreadable font files are silently skipped. The search
278/// never fails because of a single bad file.
279pub fn search(
280    paths: &[PathBuf],
281    query: &Query,
282    opts: &SearchOptions,
283) -> Result<Vec<TypgFontFaceMatch>> {
284    let discovery = PathDiscovery::new(paths.iter().cloned()).follow_symlinks(opts.follow_symlinks);
285    let candidates = discovery.discover()?;
286
287    let run_search = || -> Vec<TypgFontFaceMatch> {
288        let mut matches: Vec<TypgFontFaceMatch> = candidates
289            .par_iter()
290            .flat_map_iter(|loc| match load_metadata(&loc.path) {
291                Ok(faces) => faces,
292                Err(_) => Vec::new(),
293            })
294            .filter(|face| query.matches(&face.metadata))
295            .collect();
296
297        sort_matches(&mut matches);
298        matches
299    };
300
301    let matches = if let Some(jobs) = opts.jobs {
302        let pool = ThreadPoolBuilder::new().num_threads(jobs).build()?;
303        pool.install(run_search)
304    } else {
305        run_search()
306    };
307
308    Ok(matches)
309}
310
311/// Search directories and stream results as they're discovered.
312///
313/// Unlike [`search`], this doesn't wait until all fonts are processed. Each
314/// match is sent through the channel (`tx`) the moment it's found. Results
315/// arrive in arbitrary order — whichever thread finishes parsing a font
316/// first sends its matches first.
317///
318/// Use this for line-oriented output (plain text, paths, NDJSON) where the
319/// user benefits from seeing results immediately. The CLI's default output
320/// mode uses streaming so results start appearing while the scan is still
321/// running.
322///
323/// The sender (`tx`) is cloned across worker threads via rayon's
324/// `for_each_with`. When all threads finish, every clone is dropped, which
325/// closes the channel — the receiver knows the search is complete.
326///
327/// Corrupt or unreadable font files are silently skipped.
328pub fn search_streaming(
329    paths: &[PathBuf],
330    query: &Query,
331    opts: &SearchOptions,
332    tx: Sender<TypgFontFaceMatch>,
333) -> Result<()> {
334    let discovery = PathDiscovery::new(paths.iter().cloned()).follow_symlinks(opts.follow_symlinks);
335    let candidates = discovery.discover()?;
336
337    let run_search = || {
338        candidates
339            .par_iter()
340            .for_each_with(tx, |tx, loc| match load_metadata(&loc.path) {
341                Ok(faces) => {
342                    for face in faces {
343                        if query.matches(&face.metadata) {
344                            let _ = tx.send(face);
345                        }
346                    }
347                }
348                Err(_) => {}
349            });
350    };
351
352    if let Some(jobs) = opts.jobs {
353        let pool = ThreadPoolBuilder::new().num_threads(jobs).build()?;
354        pool.install(run_search);
355    } else {
356        run_search();
357    }
358
359    Ok(())
360}
361
362/// Filter pre-loaded font metadata against a query. No disk I/O.
363///
364/// Takes a slice of already-extracted font metadata (typically loaded from
365/// a JSON cache file) and returns only the entries matching the query.
366/// Results are sorted by path.
367///
368/// This is the fast path for repeated queries: pay the cost of scanning
369/// and extracting once (via [`search`] or the CLI's `cache add` command),
370/// save the results to a JSON file, then filter them in memory as many
371/// times as you like.
372pub fn filter_cached(entries: &[TypgFontFaceMatch], query: &Query) -> Vec<TypgFontFaceMatch> {
373    let mut matches: Vec<TypgFontFaceMatch> = entries
374        .iter()
375        .filter(|entry| query.matches(&entry.metadata))
376        .cloned()
377        .collect();
378
379    sort_matches(&mut matches);
380    matches
381}
382
383/// Read a font file and extract metadata for every face it contains.
384///
385/// A standalone `.ttf`/`.otf` file yields one `TypgFontFaceMatch`.
386/// A collection file (`.ttc`/`.otc`) yields one per face — a file with
387/// 12 faces produces 12 results.
388///
389/// Uses `read-fonts` for low-level table access (axes, features, scripts,
390/// tables, names, OS/2 classification) and `skrifa` for higher-level APIs
391/// (cmap/charmap iteration). Both crates come from Google's fontations
392/// project.
393fn load_metadata(path: &Path) -> Result<Vec<TypgFontFaceMatch>> {
394    let data = fs::read(path).with_context(|| format!("reading {}", path.display()))?;
395    let mut metas = Vec::new();
396
397    for font in FontRef::fonts(&data) {
398        let font = font?;
399        let ttc_index = font.ttc_index();
400        let sfont = if let Some(idx) = ttc_index {
401            SkrifaFontRef::from_index(&data, idx)?
402        } else {
403            SkrifaFontRef::new(&data)?
404        };
405
406        let (names, psname, tfname, lfname, tsname, lsname) = collect_names_and_details(&font);
407        let mut axis_tags = collect_axes(&font);
408        let mut feature_tags = collect_features(&font);
409        let mut script_tags = collect_scripts(&font);
410        let mut table_tags = collect_tables(&font);
411        let mut codepoints = collect_codepoints(&sfont);
412        let fvar_tag = Tag::new(b"fvar");
413        let is_variable = table_tags.contains(&fvar_tag);
414        let (weight_class, width_class, family_class) = collect_classification(&font);
415        let mut creator_names = collect_creator_names(&font);
416        let mut license_names = collect_license_names(&font);
417
418        dedup_tags(&mut axis_tags);
419        dedup_tags(&mut feature_tags);
420        dedup_tags(&mut script_tags);
421        dedup_tags(&mut table_tags);
422        dedup_codepoints(&mut codepoints);
423        creator_names.sort_unstable();
424        creator_names.dedup();
425        license_names.sort_unstable();
426        license_names.dedup();
427
428        metas.push(TypgFontFaceMatch {
429            source: TypgFontSource {
430                path: path.to_path_buf(),
431                ttc_index,
432            },
433            metadata: TypgFontFaceMeta {
434                names: dedup_names(names, path),
435                axis_tags,
436                feature_tags,
437                script_tags,
438                table_tags,
439                codepoints,
440                is_variable,
441                weight_class,
442                width_class,
443                family_class,
444                creator_names,
445                license_names,
446                psname,
447                tfname,
448                lfname,
449                tsname,
450                lsname,
451            },
452        });
453    }
454
455    Ok(metas)
456}
457
458/// List every top-level table tag in the font's table directory.
459///
460/// The table directory is the index at the start of every OpenType file.
461/// It maps four-character tags to byte offsets. Common tables: `glyf`
462/// (TrueType outlines), `CFF ` (PostScript outlines), `GSUB`, `GPOS`,
463/// `OS/2`, `name`, `cmap`, `head`, `fvar` (variable font axes).
464fn collect_tables(font: &FontRef) -> Vec<Tag> {
465    font.table_directory
466        .table_records()
467        .iter()
468        .map(|rec| rec.tag())
469        .collect()
470}
471
472/// Extract variation axis tags from the `fvar` table.
473///
474/// Returns an empty vec for static (non-variable) fonts. For variable fonts,
475/// returns tags like `wght`, `wdth`, `opsz`, `ital`, `slnt`, plus any
476/// custom axes the designer defined.
477fn collect_axes(font: &FontRef) -> Vec<Tag> {
478    if let Ok(fvar) = font.fvar() {
479        if let Ok(axes) = fvar.axes() {
480            return axes.iter().map(|axis| axis.axis_tag()).collect();
481        }
482    }
483    Vec::new()
484}
485
486/// Collect OpenType feature tags from GSUB and GPOS tables.
487///
488/// GSUB (glyph substitution) holds features like `liga` (ligatures), `smcp`
489/// (small caps), `calt` (contextual alternates). GPOS (glyph positioning)
490/// holds features like `kern` (kerning), `mark` (mark-to-base positioning).
491/// Both tables' feature lists are merged into one flat vec.
492fn collect_features(font: &FontRef) -> Vec<Tag> {
493    let mut tags = Vec::new();
494    if let Ok(gsub) = font.gsub() {
495        if let Ok(list) = gsub.feature_list() {
496            tags.extend(list.feature_records().iter().map(|rec| rec.feature_tag()));
497        }
498    }
499    if let Ok(gpos) = font.gpos() {
500        if let Ok(list) = gpos.feature_list() {
501            tags.extend(list.feature_records().iter().map(|rec| rec.feature_tag()));
502        }
503    }
504    tags
505}
506
507/// Collect script tags from GSUB and GPOS tables.
508///
509/// Script tags identify writing systems: `latn` (Latin), `arab` (Arabic),
510/// `cyrl` (Cyrillic), `hani` (CJK ideographs), `deva` (Devanagari).
511/// A font declares script support when it has shaping rules (lookups)
512/// specifically written for that script's typographic conventions.
513fn collect_scripts(font: &FontRef) -> Vec<Tag> {
514    let mut tags = Vec::new();
515    if let Ok(gsub) = font.gsub() {
516        if let Ok(list) = gsub.script_list() {
517            tags.extend(list.script_records().iter().map(|rec| rec.script_tag()));
518        }
519    }
520    if let Ok(gpos) = font.gpos() {
521        if let Ok(list) = gpos.script_list() {
522            tags.extend(list.script_records().iter().map(|rec| rec.script_tag()));
523        }
524    }
525    tags
526}
527
528/// Extract all Unicode codepoints from the font's `cmap` table.
529///
530/// The `cmap` (character map) maps Unicode codepoints to glyph IDs. If a
531/// codepoint appears here, the font has a glyph for it. We use `skrifa`'s
532/// `charmap().mappings()` iterator, which walks every (codepoint, glyph_id)
533/// pair in the font's best available cmap subtable.
534///
535/// Invalid Unicode scalar values (surrogates, out-of-range) are silently
536/// skipped via `char::from_u32`.
537fn collect_codepoints(font: &SkrifaFontRef) -> Vec<char> {
538    let mut cps = Vec::new();
539    for (cp, _) in font.charmap().mappings() {
540        if let Some(ch) = char::from_u32(cp) {
541            cps.push(ch);
542        }
543    }
544    cps
545}
546
547/// Extract identifying name strings and details from the font's `name` table.
548///
549/// The `name` table stores human-readable strings in multiple languages and
550/// encodings. We read only Unicode-encoded records for these name IDs:
551///
552/// - **Family Name** (ID 1): e.g., "Helvetica Neue"
553/// - **Typographic Family Name** (ID 16): preferred family grouping
554/// - **Subfamily Name** (ID 2): e.g., "Bold Italic"
555/// - **Typographic Subfamily Name** (ID 17): preferred style name
556/// - **Full Name** (ID 4): e.g., "Helvetica Neue Bold Italic"
557/// - **PostScript Name** (ID 6): e.g., "HelveticaNeue-BoldItalic"
558///
559/// Non-Unicode records (legacy Mac Roman, Windows symbol) are skipped.
560/// Empty or whitespace-only strings are discarded.
561fn collect_names_and_details(
562    font: &FontRef,
563) -> (
564    Vec<String>,
565    Option<String>,
566    Option<String>,
567    Option<String>,
568    Option<String>,
569    Option<String>,
570) {
571    let mut names = Vec::new();
572    let mut psname = None;
573    let mut tfname = None;
574    let mut lfname = None;
575    let mut tsname = None;
576    let mut lsname = None;
577
578    if let Ok(name_table) = font.name() {
579        let data = name_table.string_data();
580        let wanted = [
581            NameId::FAMILY_NAME,
582            NameId::TYPOGRAPHIC_FAMILY_NAME,
583            NameId::SUBFAMILY_NAME,
584            NameId::TYPOGRAPHIC_SUBFAMILY_NAME,
585            NameId::FULL_NAME,
586            NameId::POSTSCRIPT_NAME,
587        ];
588
589        for record in name_table.name_record() {
590            if !record.is_unicode() {
591                continue;
592            }
593            let name_id = record.name_id();
594            if !wanted.contains(&name_id) {
595                continue;
596            }
597            if let Ok(entry) = record.string(data) {
598                let rendered = entry.to_string();
599                let trimmed = rendered.trim().to_string();
600                if !trimmed.is_empty() {
601                    names.push(rendered);
602
603                    let val = Some(trimmed);
604                    if name_id == NameId::POSTSCRIPT_NAME && psname.is_none() {
605                        psname = val;
606                    } else if name_id == NameId::TYPOGRAPHIC_FAMILY_NAME && tfname.is_none() {
607                        tfname = val;
608                    } else if name_id == NameId::FAMILY_NAME && lfname.is_none() {
609                        lfname = val;
610                    } else if name_id == NameId::TYPOGRAPHIC_SUBFAMILY_NAME && tsname.is_none() {
611                        tsname = val;
612                    } else if name_id == NameId::SUBFAMILY_NAME && lsname.is_none() {
613                        lsname = val;
614                    }
615                }
616            }
617        }
618    }
619
620    (names, psname, tfname, lfname, tsname, lsname)
621}
622
623/// Extract creator and provenance strings from the `name` table.
624///
625/// Covers a broad range of attribution fields: copyright notice (ID 0),
626/// trademark (7), manufacturer/foundry (8), designer (9), description (10),
627/// vendor URL (11), designer URL (12), license description (13), license
628/// info URL (14). These let users search by foundry, designer, or license.
629fn collect_creator_names(font: &FontRef) -> Vec<String> {
630    let mut names = Vec::new();
631
632    if let Ok(name_table) = font.name() {
633        let data = name_table.string_data();
634        let wanted = [
635            NameId::COPYRIGHT_NOTICE,
636            NameId::TRADEMARK,
637            NameId::MANUFACTURER,
638            NameId::DESIGNER,
639            NameId::DESCRIPTION,
640            NameId::VENDOR_URL,
641            NameId::DESIGNER_URL,
642            NameId::LICENSE_DESCRIPTION,
643            NameId::LICENSE_URL,
644        ];
645
646        for record in name_table.name_record() {
647            if !record.is_unicode() {
648                continue;
649            }
650            if !wanted.contains(&record.name_id()) {
651                continue;
652            }
653            if let Ok(entry) = record.string(data) {
654                let rendered = entry.to_string();
655                if !rendered.trim().is_empty() {
656                    names.push(rendered);
657                }
658            }
659        }
660    }
661
662    names
663}
664
665/// Extract license-specific strings from the `name` table.
666///
667/// A focused subset of creator info: copyright (ID 0), license description
668/// (13), and license URL (14). Separated from the broader creator fields
669/// so callers can search specifically by license terms.
670fn collect_license_names(font: &FontRef) -> Vec<String> {
671    let mut names = Vec::new();
672
673    if let Ok(name_table) = font.name() {
674        let data = name_table.string_data();
675        let wanted = [
676            NameId::COPYRIGHT_NOTICE,
677            NameId::LICENSE_DESCRIPTION,
678            NameId::LICENSE_URL,
679        ];
680
681        for record in name_table.name_record() {
682            if !record.is_unicode() {
683                continue;
684            }
685            if !wanted.contains(&record.name_id()) {
686                continue;
687            }
688            if let Ok(entry) = record.string(data) {
689                let rendered = entry.to_string();
690                if !rendered.trim().is_empty() {
691                    names.push(rendered);
692                }
693            }
694        }
695    }
696
697    names
698}
699
700/// Extract weight class, width class, and family class from the OS/2 table.
701///
702/// The OS/2 table (named after IBM's OS/2 operating system — the name
703/// outlived the OS by decades) carries font classification metadata:
704///
705/// - `usWeightClass`: visual weight, 100 (Thin) to 900 (Black).
706/// - `usWidthClass`: visual width, 1 (UltraCondensed) to 9 (UltraExpanded).
707/// - `sFamilyClass`: a 16-bit value where the high byte is the major class
708///   (e.g., 8 = Sans Serif) and the low byte is the subclass (e.g., 1 =
709///   IBM Neo-Grotesque Gothic). We split it into `(major, subclass)`.
710///
711/// Returns `(None, None, None)` if the font lacks an OS/2 table (very rare
712/// in modern fonts, but possible in legacy or stripped files).
713fn collect_classification(font: &FontRef) -> (Option<u16>, Option<u16>, Option<(u8, u8)>) {
714    match font.os2() {
715        Ok(table) => {
716            let raw_family = table.s_family_class() as u16;
717            let class = (raw_family >> 8) as u8;
718            let subclass = (raw_family & 0x00FF) as u8;
719            (
720                Some(table.us_weight_class()),
721                Some(table.us_width_class()),
722                Some((class, subclass)),
723            )
724        }
725        Err(_) => (None, None, None),
726    }
727}
728
729/// Sort results by file path, then by TTC index within each file.
730/// Produces deterministic output regardless of thread scheduling order.
731fn sort_matches(matches: &mut [TypgFontFaceMatch]) {
732    matches.sort_by(|a, b| {
733        a.source
734            .path
735            .cmp(&b.source.path)
736            .then_with(|| a.source.ttc_index.cmp(&b.source.ttc_index))
737    });
738}
739
740fn dedup_tags(tags: &mut Vec<Tag>) {
741    tags.sort_unstable();
742    tags.dedup();
743}
744
/// Sort the codepoints in place and drop duplicates, leaving a sorted set.
fn dedup_codepoints(codepoints: &mut Vec<char>) {
    // `dedup` only removes adjacent duplicates, so sort first.
    codepoints.sort_unstable();
    codepoints.dedup();
}
749
/// Normalize a list of names and add a fallback derived from the file name.
///
/// The file stem (e.g., "HelveticaNeue-Bold" from "HelveticaNeue-Bold.otf")
/// is always added; if the path has no file stem, the displayed path is used
/// instead. This gives a font with an empty or broken `name` table something
/// to search on.
///
/// Every entry is trimmed, empty entries are removed, and the result is
/// sorted and deduplicated.
fn dedup_names(names: Vec<String>, path: &Path) -> Vec<String> {
    let fallback = match path.file_stem() {
        Some(stem) => stem.to_string_lossy().to_string(),
        None => path.display().to_string(),
    };

    let mut cleaned: Vec<String> = names
        .into_iter()
        .chain(std::iter::once(fallback))
        .map(|name| name.trim().to_string())
        .filter(|name| !name.is_empty())
        .collect();

    cleaned.sort_unstable();
    cleaned.dedup();
    cleaned
}
771
772/// Serialize OpenType tags as human-readable strings in JSON.
773///
774/// Tags are stored as binary `Tag` values internally, but serialized as
775/// four-character strings (`"wght"`, `"liga"`) for readability in JSON
776/// output and cache files.
777fn serialize_tags<S>(tags: &[Tag], serializer: S) -> Result<S::Ok, S::Error>
778where
779    S: serde::Serializer,
780{
781    let as_strings: Vec<String> = tags.iter().copied().map(tag_to_string).collect();
782    as_strings.serialize(serializer)
783}
784
785/// Deserialize OpenType tags from their string representation back to `Tag` values.
786fn deserialize_tags<'de, D>(deserializer: D) -> Result<Vec<Tag>, D::Error>
787where
788    D: serde::Deserializer<'de>,
789{
790    let raw: Vec<String> = Vec::<String>::deserialize(deserializer)?;
791    raw.into_iter()
792        .map(|s| tag4(&s).map_err(serde::de::Error::custom))
793        .collect()
794}
795
#[cfg(test)]
mod tests {
    use super::*;

    /// Names are trimmed, duplicates collapse, and the file stem is added.
    #[test]
    fn dedup_names_adds_fallback_and_trims() {
        let input = vec![String::from("  Alpha  "), String::from("Alpha")];
        let result = dedup_names(input, Path::new("/fonts/Beta.ttf"));

        assert!(
            result.iter().any(|name| name == "Alpha"),
            "original names should be trimmed and kept"
        );
        assert!(
            result.iter().any(|name| name == "Beta"),
            "file stem should be added as fallback name"
        );
        assert_eq!(
            result.len(),
            2,
            "dedup should remove duplicate entries and empty strings"
        );
    }

    /// Tags come back sorted with duplicates removed.
    #[test]
    fn dedup_tags_sorts_and_dedups() {
        let wght = tag4("wght").unwrap();
        let gsub = tag4("GSUB").unwrap();

        let mut tags = vec![wght, wght, gsub];
        dedup_tags(&mut tags);

        assert_eq!(tags, vec![gsub, wght]);
    }

    /// Codepoints come back sorted with duplicates removed.
    #[test]
    fn dedup_codepoints_sorts_and_dedups() {
        let mut codepoints = vec!['b', 'a', 'b'];
        dedup_codepoints(&mut codepoints);
        assert_eq!(codepoints, vec!['a', 'b']);
    }
}