typg_core/search.rs
1//! Font metadata extraction and search.
2//!
3//! This module opens font files, extracts searchable metadata from their
4//! OpenType tables, and evaluates that metadata against a [`Query`]. The same
5//! metadata model is used by live scans, cache files, and indexed search.
6//!
7//! One file may yield multiple results because collection formats such as TTC
8//! and OTC can store several faces in a single container.
9//!
10//! Made by FontLab <https://www.fontlab.com/>
11use std::fs;
12use std::path::{Path, PathBuf};
13use std::sync::mpsc::Sender;
14
15use anyhow::{Context, Result};
16use rayon::prelude::*;
17use rayon::ThreadPoolBuilder;
18use read_fonts::tables::name::NameId;
19use read_fonts::types::Tag;
20use read_fonts::{FontRef, TableProvider};
21use serde::{Deserialize, Serialize};
22use skrifa::{FontRef as SkrifaFontRef, MetadataProvider};
23
24use crate::discovery::{FontDiscovery, PathDiscovery};
25use crate::query::Query;
26use crate::tags::{tag4, tag_to_string};
27
28/// Everything we know about a single font face, extracted from its binary tables.
29///
30/// One font *file* may contain multiple faces (in a TTC/OTC collection), and
31/// each face gets its own `TypgFontFaceMeta`. This struct is the unit of
32/// comparison — every query filter is evaluated against one of these.
33///
34/// All tag vectors are sorted and deduplicated after extraction, so you can
35/// safely use set-intersection logic against them.
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct TypgFontFaceMeta {
38 /// Human-readable names for this font face.
39 ///
40 /// Collected from the font's `name` table: family name ("Helvetica"),
41 /// typographic family name ("Helvetica Neue"), full name ("Helvetica Neue
42 /// Bold"), PostScript name ("HelveticaNeue-Bold"), and subfamily
43 /// ("Bold"). The file stem (e.g., "HelveticaNeue-Bold" from the filename)
44 /// is always appended as a fallback, because some fonts have empty or
45 /// broken name tables.
46 ///
47 /// Sorted, deduplicated, trimmed of whitespace.
48 pub names: Vec<String>,
49
50 /// Variation axis tags. Empty for static (non-variable) fonts.
51 ///
52 /// Common axes: `wght` (weight: 100=Thin, 400=Regular, 700=Bold, 900=Black),
53 /// `wdth` (width: 75=Condensed, 100=Normal, 125=Expanded),
54 /// `opsz` (optical size: adjusts stroke contrast for small/large rendering),
55 /// `ital` (italic: 0=Upright, 1=Italic),
56 /// `slnt` (slant: oblique angle in degrees).
57 ///
58 /// Read from the font's `fvar` (font variations) table.
59 #[serde(
60 serialize_with = "serialize_tags",
61 deserialize_with = "deserialize_tags"
62 )]
63 pub axis_tags: Vec<Tag>,
64
65 /// OpenType layout feature tags from GSUB and GPOS tables.
66 ///
67 /// These control typographic behavior: `liga` (standard ligatures — fi, fl
68 /// become single glyphs), `smcp` (small capitals), `onum` (oldstyle
69 /// numerals), `kern` (kerning — fine-tuned spacing between specific letter
70 /// pairs), `calt` (contextual alternates), `dlig` (discretionary ligatures).
71 ///
72 /// GSUB features handle glyph *substitution* (replacing one glyph with
73 /// another). GPOS features handle glyph *positioning* (adjusting placement).
74 /// Both are merged here because the query doesn't distinguish them.
75 #[serde(
76 serialize_with = "serialize_tags",
77 deserialize_with = "deserialize_tags"
78 )]
79 pub feature_tags: Vec<Tag>,
80
81 /// Script tags declaring which writing systems this font supports.
82 ///
83 /// Read from GSUB and GPOS script lists. Common values: `latn` (Latin),
84 /// `arab` (Arabic), `cyrl` (Cyrillic), `grek` (Greek), `hani` (CJK
85 /// ideographs), `deva` (Devanagari), `thai` (Thai).
86 ///
87 /// A font can render characters from a script's Unicode range without
88 /// declaring script support here — the script tag means the font has
89 /// *shaping rules* (substitutions, positioning) specifically for that
90 /// writing system.
91 #[serde(
92 serialize_with = "serialize_tags",
93 deserialize_with = "deserialize_tags"
94 )]
95 pub script_tags: Vec<Tag>,
96
97 /// Every top-level table present in the font file.
98 ///
99 /// Useful for structural queries: does this font have `CFF ` (PostScript
100 /// outlines) or `glyf` (TrueType outlines)? Does it have `SVG ` (color
101 /// SVG glyphs) or `COLR` (color layer glyphs)? Does it have `fvar`
102 /// (variable font axes)?
103 ///
104 /// Read directly from the font's table directory — the index at the
105 /// start of every OpenType file.
106 #[serde(
107 serialize_with = "serialize_tags",
108 deserialize_with = "deserialize_tags"
109 )]
110 pub table_tags: Vec<Tag>,
111
112 /// Unicode codepoints this font can render, from its `cmap` table.
113 ///
114 /// The `cmap` (character map) is the font's promise: "give me this
115 /// Unicode codepoint, I'll give you a glyph." If U+00F1 (ñ) is in
116 /// this list, the font has a glyph for it.
117 ///
118 /// Sorted and deduplicated. Can be large — a CJK font may cover
119 /// 20,000+ codepoints.
120 pub codepoints: Vec<char>,
121
122 /// Whether this font has an `fvar` table, making it a variable font.
123 ///
124 /// Variable fonts contain continuous design axes (weight, width, etc.)
125 /// instead of discrete named instances. A single variable font file can
126 /// replace an entire family of static fonts.
127 pub is_variable: bool,
128
129 /// OS/2 `usWeightClass` value. Indicates visual weight on a 1–1000 scale.
130 ///
131 /// Standard values: 100=Thin, 200=ExtraLight, 300=Light, 400=Regular,
132 /// 500=Medium, 600=SemiBold, 700=Bold, 800=ExtraBold, 900=Black.
133 /// `None` if the font has no OS/2 table (rare in modern fonts).
134 #[serde(default)]
135 pub weight_class: Option<u16>,
136
137 /// OS/2 `usWidthClass` value. Indicates visual width on a 1–9 scale.
138 ///
139 /// Values: 1=UltraCondensed, 2=ExtraCondensed, 3=Condensed,
140 /// 4=SemiCondensed, 5=Normal, 6=SemiExpanded, 7=Expanded,
141 /// 8=ExtraExpanded, 9=UltraExpanded.
142 /// `None` if the font has no OS/2 table.
143 #[serde(default)]
144 pub width_class: Option<u16>,
145
146 /// OS/2 `sFamilyClass` split into (major class, subclass).
147 ///
148 /// The major class groups fonts by general style: 0=No classification,
149 /// 1=Oldstyle Serifs, 2=Transitional Serifs, 3=Modern Serifs,
150 /// 4=Clarendon Serifs, 5=Slab Serifs, 7=Freeform Serifs,
151 /// 8=Sans Serif, 9=Ornamentals, 10=Scripts, 12=Symbolic.
152 ///
153 /// The subclass provides finer detail within each major class.
154 /// For example, within Sans Serif (8): 1=IBM Neo-Grotesque Gothic,
155 /// 2=Humanist, 3=Low-x Round Geometric, etc.
156 ///
157 /// `None` if the font has no OS/2 table.
158 #[serde(default)]
159 pub family_class: Option<(u8, u8)>,
160
161 /// Creator and provenance strings from the font's name table.
162 ///
163 /// Includes: copyright notice (name ID 0), trademark (7), manufacturer
164 /// (8), designer (9), description (10), vendor URL (11), designer URL
165 /// (12), license description (13), license URL (14).
166 ///
167 /// Useful for searching by foundry ("Adobe"), designer ("Matthew Carter"),
168 /// or license type ("OFL").
169 #[serde(default)]
170 pub creator_names: Vec<String>,
171
172 /// License-specific strings from the font's name table.
173 ///
174 /// A subset of creator info focused on licensing: copyright notice
175 /// (name ID 0), license description (13), license info URL (14).
176 ///
177 /// Useful for compliance checks: "show me all fonts with an SIL Open
178 /// Font License" or "find fonts with no license URL."
179 #[serde(default)]
180 pub license_names: Vec<String>,
181
182 /// PostScript Name (ID 6)
183 #[serde(default)]
184 pub psname: Option<String>,
185
186 /// Typographic Family Name (ID 16)
187 #[serde(default)]
188 pub tfname: Option<String>,
189
190 /// Legacy Family Name (ID 1)
191 #[serde(default)]
192 pub lfname: Option<String>,
193
194 /// Typographic Subfamily Name (ID 17)
195 #[serde(default)]
196 pub tsname: Option<String>,
197
198 /// Legacy Subfamily Name (ID 2)
199 #[serde(default)]
200 pub lsname: Option<String>,
201}
202
203/// Where a font face lives on disk.
204///
205/// For standalone `.ttf`/`.otf` files, the path is enough. For collection
206/// files (`.ttc`/`.otc`) that bundle multiple faces, the `ttc_index`
207/// identifies which face inside the collection this refers to.
208#[derive(Debug, Clone, Serialize, Deserialize)]
209pub struct TypgFontSource {
210 /// Filesystem path to the font file.
211 pub path: PathBuf,
212 /// Face index within a TTC/OTC collection file.
213 ///
214 /// `None` for single-face files (`.ttf`, `.otf`).
215 /// `Some(0)`, `Some(1)`, etc. for faces inside a collection.
216 /// For example, a `.ttc` containing "Arial" and "Arial Bold" would have
217 /// indices 0 and 1.
218 pub ttc_index: Option<u32>,
219}
220
221impl TypgFontSource {
222 /// Format as `path#index` for collection members, plain path otherwise.
223 ///
224 /// Examples: `/fonts/Noto.ttc#0`, `/fonts/Noto.ttc#1`, `/fonts/Arial.ttf`.
225 /// This notation is a common convention across font tools.
226 pub fn path_with_index(&self) -> String {
227 if let Some(idx) = self.ttc_index {
228 format!("{}#{idx}", self.path.display())
229 } else {
230 self.path.display().to_string()
231 }
232 }
233}
234
235/// A search result: one font face that matched the query.
236///
237/// Pairs the file location ([`TypgFontSource`]) with everything we extracted
238/// from the font's binary tables ([`TypgFontFaceMeta`]). This is the primary
239/// output type of the search engine — what you iterate over to display results,
240/// build caches, or pipe into downstream tools.
241#[derive(Debug, Clone, Serialize, Deserialize)]
242pub struct TypgFontFaceMatch {
243 /// Where the font lives: file path and optional TTC/OTC face index.
244 pub source: TypgFontSource,
245 /// Metadata extracted from the font's internal tables.
246 pub metadata: TypgFontFaceMeta,
247}
248
/// Execution settings for a search: directory traversal and parallelism.
#[derive(Debug, Default, Clone)]
pub struct SearchOptions {
    /// Whether directory walking follows symbolic links.
    ///
    /// Defaults to `false` so circular symlinks cannot cause endless
    /// traversal. Enable it when font directories contain symlinks to the
    /// real font folders.
    pub follow_symlinks: bool,

    /// Number of worker threads used for font parsing.
    ///
    /// `None` (the default) runs on rayon's default thread pool. `Some(n)`
    /// builds a dedicated pool with `n` threads; `Some(1)` gives
    /// single-threaded operation, which is useful for debugging or
    /// constrained environments.
    pub jobs: Option<usize>,
}
266
267/// Search directories for fonts matching a query. The main entry point.
268///
269/// Walks the given directories, opens every font file found, extracts
270/// metadata, filters against the query, and returns all matches sorted by
271/// path (then by TTC index within each path).
272///
273/// This function collects all results in memory before returning — use
274/// [`search_streaming`] if you want results delivered as they're found
275/// (better for CLI output where users want to see progress immediately).
276///
277/// Corrupt or unreadable font files are silently skipped. The search
278/// never fails because of a single bad file.
279pub fn search(
280 paths: &[PathBuf],
281 query: &Query,
282 opts: &SearchOptions,
283) -> Result<Vec<TypgFontFaceMatch>> {
284 let discovery = PathDiscovery::new(paths.iter().cloned()).follow_symlinks(opts.follow_symlinks);
285 let candidates = discovery.discover()?;
286
287 let run_search = || -> Vec<TypgFontFaceMatch> {
288 let mut matches: Vec<TypgFontFaceMatch> = candidates
289 .par_iter()
290 .flat_map_iter(|loc| load_metadata(&loc.path).unwrap_or_default())
291 .filter(|face| query.matches(&face.metadata))
292 .collect();
293
294 sort_matches(&mut matches);
295 matches
296 };
297
298 let matches = if let Some(jobs) = opts.jobs {
299 let pool = ThreadPoolBuilder::new().num_threads(jobs).build()?;
300 pool.install(run_search)
301 } else {
302 run_search()
303 };
304
305 Ok(matches)
306}
307
308/// Search directories and stream results as they're discovered.
309///
310/// Unlike [`search`], this doesn't wait until all fonts are processed. Each
311/// match is sent through the channel (`tx`) the moment it's found. Results
312/// arrive in arbitrary order — whichever thread finishes parsing a font
313/// first sends its matches first.
314///
315/// Use this for line-oriented output (plain text, paths, NDJSON) where the
316/// user benefits from seeing results immediately. The CLI's default output
317/// mode uses streaming so results start appearing while the scan is still
318/// running.
319///
320/// The sender (`tx`) is cloned across worker threads via rayon's
321/// `for_each_with`. When all threads finish, every clone is dropped, which
322/// closes the channel — the receiver knows the search is complete.
323///
324/// Corrupt or unreadable font files are silently skipped.
325pub fn search_streaming(
326 paths: &[PathBuf],
327 query: &Query,
328 opts: &SearchOptions,
329 tx: Sender<TypgFontFaceMatch>,
330) -> Result<()> {
331 let discovery = PathDiscovery::new(paths.iter().cloned()).follow_symlinks(opts.follow_symlinks);
332 let candidates = discovery.discover()?;
333
334 let run_search = || {
335 candidates.par_iter().for_each_with(tx, |tx, loc| {
336 if let Ok(faces) = load_metadata(&loc.path) {
337 for face in faces {
338 if query.matches(&face.metadata) {
339 let _ = tx.send(face);
340 }
341 }
342 }
343 });
344 };
345
346 if let Some(jobs) = opts.jobs {
347 let pool = ThreadPoolBuilder::new().num_threads(jobs).build()?;
348 pool.install(run_search);
349 } else {
350 run_search();
351 }
352
353 Ok(())
354}
355
356/// Filter pre-loaded font metadata against a query. No disk I/O.
357///
358/// Takes a slice of already-extracted font metadata (typically loaded from
359/// a JSON cache file) and returns only the entries matching the query.
360/// Results are sorted by path.
361///
362/// This is the fast path for repeated queries: pay the cost of scanning
363/// and extracting once (via [`search`] or the CLI's `cache add` command),
364/// save the results to a JSON file, then filter them in memory as many
365/// times as you like.
366pub fn filter_cached(entries: &[TypgFontFaceMatch], query: &Query) -> Vec<TypgFontFaceMatch> {
367 let mut matches: Vec<TypgFontFaceMatch> = entries
368 .iter()
369 .filter(|entry| query.matches(&entry.metadata))
370 .cloned()
371 .collect();
372
373 sort_matches(&mut matches);
374 matches
375}
376
377/// Read a font file and extract metadata for every face it contains.
378///
379/// A standalone `.ttf`/`.otf` file yields one `TypgFontFaceMatch`.
380/// A collection file (`.ttc`/`.otc`) yields one per face — a file with
381/// 12 faces produces 12 results.
382///
383/// Uses `read-fonts` for low-level table access (axes, features, scripts,
384/// tables, names, OS/2 classification) and `skrifa` for higher-level APIs
385/// (cmap/charmap iteration). Both crates come from Google's fontations
386/// project.
387fn load_metadata(path: &Path) -> Result<Vec<TypgFontFaceMatch>> {
388 let data = fs::read(path).with_context(|| format!("reading {}", path.display()))?;
389 let mut metas = Vec::new();
390
391 for font in FontRef::fonts(&data) {
392 let font = font?;
393 let ttc_index = font.ttc_index();
394 let sfont = if let Some(idx) = ttc_index {
395 SkrifaFontRef::from_index(&data, idx)?
396 } else {
397 SkrifaFontRef::new(&data)?
398 };
399
400 let (names, psname, tfname, lfname, tsname, lsname) = collect_names_and_details(&font);
401 let mut axis_tags = collect_axes(&font);
402 let mut feature_tags = collect_features(&font);
403 let mut script_tags = collect_scripts(&font);
404 let mut table_tags = collect_tables(&font);
405 let mut codepoints = collect_codepoints(&sfont);
406 let fvar_tag = Tag::new(b"fvar");
407 let is_variable = table_tags.contains(&fvar_tag);
408 let (weight_class, width_class, family_class) = collect_classification(&font);
409 let mut creator_names = collect_creator_names(&font);
410 let mut license_names = collect_license_names(&font);
411
412 dedup_tags(&mut axis_tags);
413 dedup_tags(&mut feature_tags);
414 dedup_tags(&mut script_tags);
415 dedup_tags(&mut table_tags);
416 dedup_codepoints(&mut codepoints);
417 creator_names.sort_unstable();
418 creator_names.dedup();
419 license_names.sort_unstable();
420 license_names.dedup();
421
422 metas.push(TypgFontFaceMatch {
423 source: TypgFontSource {
424 path: path.to_path_buf(),
425 ttc_index,
426 },
427 metadata: TypgFontFaceMeta {
428 names: dedup_names(names, path),
429 axis_tags,
430 feature_tags,
431 script_tags,
432 table_tags,
433 codepoints,
434 is_variable,
435 weight_class,
436 width_class,
437 family_class,
438 creator_names,
439 license_names,
440 psname,
441 tfname,
442 lfname,
443 tsname,
444 lsname,
445 },
446 });
447 }
448
449 Ok(metas)
450}
451
452/// List every top-level table tag in the font's table directory.
453///
454/// The table directory is the index at the start of every OpenType file.
455/// It maps four-character tags to byte offsets. Common tables: `glyf`
456/// (TrueType outlines), `CFF ` (PostScript outlines), `GSUB`, `GPOS`,
457/// `OS/2`, `name`, `cmap`, `head`, `fvar` (variable font axes).
458fn collect_tables(font: &FontRef) -> Vec<Tag> {
459 font.table_directory
460 .table_records()
461 .iter()
462 .map(|rec| rec.tag())
463 .collect()
464}
465
466/// Extract variation axis tags from the `fvar` table.
467///
468/// Returns an empty vec for static (non-variable) fonts. For variable fonts,
469/// returns tags like `wght`, `wdth`, `opsz`, `ital`, `slnt`, plus any
470/// custom axes the designer defined.
471fn collect_axes(font: &FontRef) -> Vec<Tag> {
472 if let Ok(fvar) = font.fvar() {
473 if let Ok(axes) = fvar.axes() {
474 return axes.iter().map(|axis| axis.axis_tag()).collect();
475 }
476 }
477 Vec::new()
478}
479
480/// Collect OpenType feature tags from GSUB and GPOS tables.
481///
482/// GSUB (glyph substitution) holds features like `liga` (ligatures), `smcp`
483/// (small caps), `calt` (contextual alternates). GPOS (glyph positioning)
484/// holds features like `kern` (kerning), `mark` (mark-to-base positioning).
485/// Both tables' feature lists are merged into one flat vec.
486fn collect_features(font: &FontRef) -> Vec<Tag> {
487 let mut tags = Vec::new();
488 if let Ok(gsub) = font.gsub() {
489 if let Ok(list) = gsub.feature_list() {
490 tags.extend(list.feature_records().iter().map(|rec| rec.feature_tag()));
491 }
492 }
493 if let Ok(gpos) = font.gpos() {
494 if let Ok(list) = gpos.feature_list() {
495 tags.extend(list.feature_records().iter().map(|rec| rec.feature_tag()));
496 }
497 }
498 tags
499}
500
501/// Collect script tags from GSUB and GPOS tables.
502///
503/// Script tags identify writing systems: `latn` (Latin), `arab` (Arabic),
504/// `cyrl` (Cyrillic), `hani` (CJK ideographs), `deva` (Devanagari).
505/// A font declares script support when it has shaping rules (lookups)
506/// specifically written for that script's typographic conventions.
507fn collect_scripts(font: &FontRef) -> Vec<Tag> {
508 let mut tags = Vec::new();
509 if let Ok(gsub) = font.gsub() {
510 if let Ok(list) = gsub.script_list() {
511 tags.extend(list.script_records().iter().map(|rec| rec.script_tag()));
512 }
513 }
514 if let Ok(gpos) = font.gpos() {
515 if let Ok(list) = gpos.script_list() {
516 tags.extend(list.script_records().iter().map(|rec| rec.script_tag()));
517 }
518 }
519 tags
520}
521
522/// Extract all Unicode codepoints from the font's `cmap` table.
523///
524/// The `cmap` (character map) maps Unicode codepoints to glyph IDs. If a
525/// codepoint appears here, the font has a glyph for it. We use `skrifa`'s
526/// `charmap().mappings()` iterator, which walks every (codepoint, glyph_id)
527/// pair in the font's best available cmap subtable.
528///
529/// Invalid Unicode scalar values (surrogates, out-of-range) are silently
530/// skipped via `char::from_u32`.
531fn collect_codepoints(font: &SkrifaFontRef) -> Vec<char> {
532 let mut cps = Vec::new();
533 for (cp, _) in font.charmap().mappings() {
534 if let Some(ch) = char::from_u32(cp) {
535 cps.push(ch);
536 }
537 }
538 cps
539}
540
541/// Extract identifying name strings and details from the font's `name` table.
542///
543/// The `name` table stores human-readable strings in multiple languages and
544/// encodings. We read only Unicode-encoded records for these name IDs:
545///
546/// - **Family Name** (ID 1): e.g., "Helvetica Neue"
547/// - **Typographic Family Name** (ID 16): preferred family grouping
548/// - **Subfamily Name** (ID 2): e.g., "Bold Italic"
549/// - **Typographic Subfamily Name** (ID 17): preferred style name
550/// - **Full Name** (ID 4): e.g., "Helvetica Neue Bold Italic"
551/// - **PostScript Name** (ID 6): e.g., "HelveticaNeue-BoldItalic"
552///
553/// Non-Unicode records (legacy Mac Roman, Windows symbol) are skipped.
554/// Empty or whitespace-only strings are discarded.
555#[allow(clippy::type_complexity)]
556fn collect_names_and_details(
557 font: &FontRef,
558) -> (
559 Vec<String>,
560 Option<String>,
561 Option<String>,
562 Option<String>,
563 Option<String>,
564 Option<String>,
565) {
566 let mut names = Vec::new();
567 let mut psname = None;
568 let mut tfname = None;
569 let mut lfname = None;
570 let mut tsname = None;
571 let mut lsname = None;
572
573 if let Ok(name_table) = font.name() {
574 let data = name_table.string_data();
575 let wanted = [
576 NameId::FAMILY_NAME,
577 NameId::TYPOGRAPHIC_FAMILY_NAME,
578 NameId::SUBFAMILY_NAME,
579 NameId::TYPOGRAPHIC_SUBFAMILY_NAME,
580 NameId::FULL_NAME,
581 NameId::POSTSCRIPT_NAME,
582 ];
583
584 for record in name_table.name_record() {
585 if !record.is_unicode() {
586 continue;
587 }
588 let name_id = record.name_id();
589 if !wanted.contains(&name_id) {
590 continue;
591 }
592 if let Ok(entry) = record.string(data) {
593 let rendered = entry.to_string();
594 let trimmed = rendered.trim().to_string();
595 if !trimmed.is_empty() {
596 names.push(rendered);
597
598 let val = Some(trimmed);
599 if name_id == NameId::POSTSCRIPT_NAME && psname.is_none() {
600 psname = val;
601 } else if name_id == NameId::TYPOGRAPHIC_FAMILY_NAME && tfname.is_none() {
602 tfname = val;
603 } else if name_id == NameId::FAMILY_NAME && lfname.is_none() {
604 lfname = val;
605 } else if name_id == NameId::TYPOGRAPHIC_SUBFAMILY_NAME && tsname.is_none() {
606 tsname = val;
607 } else if name_id == NameId::SUBFAMILY_NAME && lsname.is_none() {
608 lsname = val;
609 }
610 }
611 }
612 }
613 }
614
615 (names, psname, tfname, lfname, tsname, lsname)
616}
617
618/// Extract creator and provenance strings from the `name` table.
619///
620/// Covers a broad range of attribution fields: copyright notice (ID 0),
621/// trademark (7), manufacturer/foundry (8), designer (9), description (10),
622/// vendor URL (11), designer URL (12), license description (13), license
623/// info URL (14). These let users search by foundry, designer, or license.
624fn collect_creator_names(font: &FontRef) -> Vec<String> {
625 let mut names = Vec::new();
626
627 if let Ok(name_table) = font.name() {
628 let data = name_table.string_data();
629 let wanted = [
630 NameId::COPYRIGHT_NOTICE,
631 NameId::TRADEMARK,
632 NameId::MANUFACTURER,
633 NameId::DESIGNER,
634 NameId::DESCRIPTION,
635 NameId::VENDOR_URL,
636 NameId::DESIGNER_URL,
637 NameId::LICENSE_DESCRIPTION,
638 NameId::LICENSE_URL,
639 ];
640
641 for record in name_table.name_record() {
642 if !record.is_unicode() {
643 continue;
644 }
645 if !wanted.contains(&record.name_id()) {
646 continue;
647 }
648 if let Ok(entry) = record.string(data) {
649 let rendered = entry.to_string();
650 if !rendered.trim().is_empty() {
651 names.push(rendered);
652 }
653 }
654 }
655 }
656
657 names
658}
659
660/// Extract license-specific strings from the `name` table.
661///
662/// A focused subset of creator info: copyright (ID 0), license description
663/// (13), and license URL (14). Separated from the broader creator fields
664/// so callers can search specifically by license terms.
665fn collect_license_names(font: &FontRef) -> Vec<String> {
666 let mut names = Vec::new();
667
668 if let Ok(name_table) = font.name() {
669 let data = name_table.string_data();
670 let wanted = [
671 NameId::COPYRIGHT_NOTICE,
672 NameId::LICENSE_DESCRIPTION,
673 NameId::LICENSE_URL,
674 ];
675
676 for record in name_table.name_record() {
677 if !record.is_unicode() {
678 continue;
679 }
680 if !wanted.contains(&record.name_id()) {
681 continue;
682 }
683 if let Ok(entry) = record.string(data) {
684 let rendered = entry.to_string();
685 if !rendered.trim().is_empty() {
686 names.push(rendered);
687 }
688 }
689 }
690 }
691
692 names
693}
694
695/// Extract weight class, width class, and family class from the OS/2 table.
696///
697/// The OS/2 table (named after IBM's OS/2 operating system — the name
698/// outlived the OS by decades) carries font classification metadata:
699///
700/// - `usWeightClass`: visual weight, 100 (Thin) to 900 (Black).
701/// - `usWidthClass`: visual width, 1 (UltraCondensed) to 9 (UltraExpanded).
702/// - `sFamilyClass`: a 16-bit value where the high byte is the major class
703/// (e.g., 8 = Sans Serif) and the low byte is the subclass (e.g., 1 =
704/// IBM Neo-Grotesque Gothic). We split it into `(major, subclass)`.
705///
706/// Returns `(None, None, None)` if the font lacks an OS/2 table (very rare
707/// in modern fonts, but possible in legacy or stripped files).
708fn collect_classification(font: &FontRef) -> (Option<u16>, Option<u16>, Option<(u8, u8)>) {
709 match font.os2() {
710 Ok(table) => {
711 let raw_family = table.s_family_class() as u16;
712 let class = (raw_family >> 8) as u8;
713 let subclass = (raw_family & 0x00FF) as u8;
714 (
715 Some(table.us_weight_class()),
716 Some(table.us_width_class()),
717 Some((class, subclass)),
718 )
719 }
720 Err(_) => (None, None, None),
721 }
722}
723
724/// Sort results by file path, then by TTC index within each file.
725/// Produces deterministic output regardless of thread scheduling order.
726fn sort_matches(matches: &mut [TypgFontFaceMatch]) {
727 matches.sort_by(|a, b| {
728 a.source
729 .path
730 .cmp(&b.source.path)
731 .then_with(|| a.source.ttc_index.cmp(&b.source.ttc_index))
732 });
733}
734
735fn dedup_tags(tags: &mut Vec<Tag>) {
736 tags.sort_unstable();
737 tags.dedup();
738}
739
/// Sort codepoints into ascending order and remove repeated entries, in
/// place.
fn dedup_codepoints(codepoints: &mut Vec<char>) {
    // `dedup` only drops *adjacent* repeats, so the sort has to come first.
    codepoints.sort_unstable();
    codepoints.dedup();
}
744
/// Normalise a list of names and add a fallback derived from the file name.
///
/// The file stem (e.g. "HelveticaNeue-Bold" from "HelveticaNeue-Bold.otf")
/// is always appended; if the path has no file stem, the displayed path is
/// used instead. Every name is then trimmed, names that end up empty are
/// dropped, and the rest is sorted and deduplicated.
///
/// The fallback gives a face a searchable name even when its `name` table
/// yielded nothing.
fn dedup_names(mut names: Vec<String>, path: &Path) -> Vec<String> {
    let fallback = match path.file_stem() {
        Some(stem) => stem.to_string_lossy().to_string(),
        None => path.display().to_string(),
    };
    names.push(fallback);

    let mut cleaned: Vec<String> = names
        .iter()
        .map(|name| name.trim())
        .filter(|name| !name.is_empty())
        .map(str::to_string)
        .collect();

    cleaned.sort_unstable();
    cleaned.dedup();
    cleaned
}
766
767/// Serialize OpenType tags as human-readable strings in JSON.
768///
769/// Tags are stored as binary `Tag` values internally, but serialized as
770/// four-character strings (`"wght"`, `"liga"`) for readability in JSON
771/// output and cache files.
772fn serialize_tags<S>(tags: &[Tag], serializer: S) -> Result<S::Ok, S::Error>
773where
774 S: serde::Serializer,
775{
776 let as_strings: Vec<String> = tags.iter().copied().map(tag_to_string).collect();
777 as_strings.serialize(serializer)
778}
779
780/// Deserialize OpenType tags from their string representation back to `Tag` values.
781fn deserialize_tags<'de, D>(deserializer: D) -> Result<Vec<Tag>, D::Error>
782where
783 D: serde::Deserializer<'de>,
784{
785 let raw: Vec<String> = Vec::<String>::deserialize(deserializer)?;
786 raw.into_iter()
787 .map(|s| tag4(&s).map_err(serde::de::Error::custom))
788 .collect()
789}
790
#[cfg(test)]
mod tests {
    use super::*;

    /// Names are trimmed and merged, and the file stem is added.
    #[test]
    fn dedup_names_adds_fallback_and_trims() {
        let raw = vec![" Alpha ".to_string(), "Alpha".to_string()];
        let cleaned = dedup_names(raw, Path::new("/fonts/Beta.ttf"));

        let has = |wanted: &str| cleaned.iter().any(|name| name == wanted);
        assert!(has("Alpha"), "original names should be trimmed and kept");
        assert!(has("Beta"), "file stem should be added as fallback name");
        assert_eq!(
            cleaned.len(),
            2,
            "dedup should remove duplicate entries and empty strings"
        );
    }

    /// Repeated tags collapse to one and the result is ordered.
    #[test]
    fn dedup_tags_sorts_and_dedups() {
        let wght = tag4("wght").unwrap();
        let gsub = tag4("GSUB").unwrap();

        let mut tags = vec![wght, wght, gsub];
        dedup_tags(&mut tags);

        assert_eq!(tags, vec![gsub, wght]);
    }

    /// Repeated codepoints collapse to one and the result is ordered.
    #[test]
    fn dedup_codepoints_sorts_and_dedups() {
        let mut codepoints = vec!['b', 'a', 'b'];
        dedup_codepoints(&mut codepoints);

        assert_eq!(codepoints, vec!['a', 'b']);
    }
}