unifont_bitmap/
lib.rs

1//! This crate incorporates the data for [GNU Unifont][1] in compressed binary
2//! form. It concerns itself with glyph lookup and caching only. It does not
3//! provide any rendering (like [`sdl2-unifont`][2] does) or even pixel lookup
4//! (like [`unifont`][3] does). It is nothing more than a compression scheme
5//! for the raw binary data represented in the `.hex` files that comprise
6//! GNU Unifont's "source code".
7//!
8//! [1]: http://unifoundry.com/unifont/index.html
9//! [2]: https://crates.io/crates/sdl2-unifont
10//! [3]: https://crates.io/crates/unifont
11//!
12//! # Background
13//!
14//! GNU Unifont is a bitmap font covering every character in Unicode. Narrow
15//! characters are 8x16 pixels, and wide characters are 16x16 pixels. GNU
16//! Unifont can be used to render any text that can be represented entirely
17//! without combining characters, ligatures, or other frippery. For example, it
18//! can render "ÿ", since that is encoded in Unicode as a single character:
19//!
20//! 1. `U+00FF LATIN SMALL LETTER Y WITH DIAERESIS` ("ÿ")
21//!
22//! But it could *not* render "ÿ̰́", which is a sequence of:
23//!
24//! 1. `U+0079 LATIN SMALL LETTER Y` ("y")
25//! 2. `U+0308 COMBINING DIAERESIS` ("◌̈")
26//! 3. `U+0301 COMBINING ACUTE ACCENT` ("◌́")
27//! 4. `U+0330 COMBINING TILDE BELOW` ("◌̰")
28//!
29//! In addition to basic concerns about putting pixels on the screen, any text
30//! rendering system may also have to account for [bidirectional text][4] (and
31//! right-to-left scripts in general) and take special care when [breaking
32//! lines of text][5]. Not to mention "invisible characters". All of these
33//! concerns are outside the scope of this crate, which, again, has the sole
34//! and simple purpose of retrieving the individual GNU Unifont glyph that
35//! represents a given Unicode code point.
36//!
37//! [4]: https://unicode.org/reports/tr9/
38//! [5]: https://unicode.org/reports/tr14/
39//!
40//! The font data is embedded in your executable, in compressed form. The whole
41//! thing is less than a megabyte in size when compressed, and if you somehow
42//! end up using every page, it adds about 2.3 megabytes of runtime memory
43//! overhead. This is a small price to pay for a font that covers every Unicode
44//! character.
45//!
46//! # Usage
47//!
48//! Single-threaded usage is simple, via the [`Unifont`](struct.Unifont.html)
49//! struct:
50//!
51//! ```rust
52//! use unifont_bitmap::Unifont;
53//! let mut unifont = Unifont::open();
54//! // Get a bitmap, loading its page if necessary. Requires mut.
55//! let my_bitmap = unifont.load_bitmap('井' as u32);
56//! println!("{} pixels wide.", if my_bitmap.is_wide() { 16 } else { 8 });
57//! println!("Bytes: {:?}", my_bitmap.get_bytes());
58//! // Get a bitmap, iff its page is already loaded. Does not require mut.
59//! let my_bitmap = unifont.get_bitmap('井' as u32).unwrap();
60//! println!("{} pixels wide.", if my_bitmap.is_wide() { 16 } else { 8 });
61//! println!("Bytes: {:?}", my_bitmap.get_bytes());
62//! ```
63//!
64//! What you do from here is complicated, and outside this crate's pay grade.
65//!
66//! # Legalese
67//!
68//! The `unifont-bitmap` crate is copyright 2021, Solra Bizna, and licensed
69//! under either of:
70//!
71//!  * Apache License, Version 2.0
72//!    ([LICENSE-APACHE](LICENSE-APACHE) or
73//!    <http://www.apache.org/licenses/LICENSE-2.0>)
74//!  * MIT license
75//!    ([LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>)
76//!
77//! at your option.
78//!
79//! As for GNU Unifont:
80//!
81//! > Copyright (C) 1998-2021 Roman Czyborra, Paul Hardy, Qianqian Fang,
82//! > Andrew Miller, Johnnie Weaver, David Corbett, Nils Moskopp, Rebecca
83//! > Bettencourt, et al. License: SIL Open Font License version 1.1 and
84//! > GPLv2+: GNU GPL version 2 or later <http://gnu.org/licenses/gpl.html>
85//! > with the GNU Font Embedding Exception.
86//!
87//! I believe that this license is compatible with `unifont-bitmap`'s use of
88//! the font. If the font ends up statically linked into a non-GPL-compatible
89//! application, e.g. for its own use in UI elements, my interpretation of the
90//! license is that this is equivalent to embedding it into a document; thus
91//! explicitly permitted by the Font Embedding Exception. If one of the
92//! copyright holders and/or the Free Software Foundation disagrees with this
//! interpretation, I'd be open to discussing the issue.
94//!
95//! ## Contribution
96//!
97//! Unless you explicitly state otherwise, any contribution intentionally
98//! submitted for inclusion in the `unifont-bitmap` crate by you, as defined
99//! in the Apache-2.0 license, shall be dual licensed as above, without any
100//! additional terms or conditions.
101
102use byteorder::{ReadBytesExt, BigEndian};
103
/// The compressed Unifont data, embedded into the binary at build time.
///
/// Layout, as consumed by `Unifont::populate_page_infos`: a big-endian `u32`
/// giving the byte length of the compressed page table, then that table
/// (a zlib stream), then the compressed streams of the non-empty pages laid
/// out back to back.
const UNIFONT_DATA: &[u8] = include_bytes!("unifont.dat");
105
/// The largest codepoint value that is, or ever will be, legal in Unicode
/// (`U+10FFFF`, which is also the largest value a `char` can hold).
pub const MAX_UNICODE_CODEPOINT: u32 = char::MAX as u32;
/// The number of legal codepoint values that exist in Unicode.
pub const NUM_UNICODE_CODEPOINTS: u32 = 1 + MAX_UNICODE_CODEPOINT;
/// The number of 256-codepoint "pages" that exist in Unicode.
pub const NUM_UNICODE_PAGES: u32 = NUM_UNICODE_CODEPOINTS / 256;
/// The largest number of a 256-codepoint "page" that exists in Unicode.
pub const MAX_UNICODE_PAGE: u32 = NUM_UNICODE_PAGES - 1;
114
/// A single 8x16 or 16x16 bitmap, corresponding to a single displayed glyph.
///
/// This is a cheap, borrowed view of the glyph's bytes; cloning it does not
/// copy any pixel data. Note that one glyph is not necessarily one
/// user-perceived character: the module documentation explains why
/// combining characters, invisible characters, and similar concerns are
/// outside this crate's scope.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct Bitmap<'a> {
    /// The raw rows of the glyph: 16 bytes when narrow, 32 bytes when wide.
    bytes: &'a [u8],
}
122
123impl<'a> core::fmt::Debug for Bitmap<'a> {
124    fn fmt(&self, fmt: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
125	fmt.write_str("unifont bitmap {")?;
126	core::fmt::Debug::fmt(self.bytes, fmt)?;
127	fmt.write_str("}")
128    }
129}
130
131impl<'a> Bitmap<'a> {
132    /// Returns the bytes that make up the given bitmap. Each byte contains 8
133    /// pixels. The highest order bit of the byte is the leftmost pixel, the
134    /// next highest order bit is the next pixel, and so on. If the glyph is
135    /// wide (see `is_wide`) then there are two bytes per row, otherwise there
136    /// is one byte per row.
137    pub fn get_bytes(&self) -> &'a [u8] { self.bytes }
138    /// Returns `true` if the bitmap is wide (16x16), `false` if it is narrow
139    /// (8x16).
140    pub fn is_wide(&self) -> bool {
141	match self.bytes.len() {
142	    16 => false,
143	    32 => true,
144	    _ => unreachable!(),
145	}
146    }
147    /// Returns the dimensions of the bitmap, width then height.
148    /// Always returns (8,16) or (16,16).
149    pub fn get_dimensions<T: From<u8>>(&self) -> (T, T) {
150	match self.is_wide() {
151	    false => (8.into(), 16.into()),
152	    true => (16.into(), 16.into()),
153	}
154    }
155}
156
/// Bookkeeping for one 256-codepoint page of the font.
#[derive(Default)]
struct PageInfo {
    /// Byte offset into the embedded font data at which this page's
    /// compressed stream begins. Left at 0 for pages with no data.
    compressed_offset: u32,
    /// Size in bytes of the buffer this page decompresses into. A value of 0
    /// marks a page for which the font stores nothing at all.
    uncompressed_size: u32,
    /// The decompressed page, present once the page has been loaded: a
    /// 512-byte table of per-codepoint offsets followed by the glyph rows.
    raw_data: Option<Vec<u8>>,
}
163
/// A data structure for caching Unifont character bitmaps. Decompresses the
/// compressed font data in the executable on demand, and caches it in blocks
/// ("pages") of 256 code points each.
///
/// Create one with `Unifont::open()`. Use `load_bitmap` to look up a glyph,
/// decompressing its page if needed, or `get_bitmap` to look up a glyph only
/// if its page is already cached.
pub struct Unifont {
    // One entry per Unicode page, indexed by `codepoint >> 8`.
    pages: [PageInfo; NUM_UNICODE_PAGES as usize],
}
170
171impl Unifont {
172    /// Loads the Unifont bitmap corresponding to the given Unicode codepoint
173    /// (if necessary), and returns it.
174    ///
175    /// Will return the bitmap for U+FFFD REPLACEMENT CHAR (�) if Unifont does
176    /// not include a glyph for this bitmap.
177    ///
178    /// **PANICS** if you pass a `codepoint` larger than
179    /// `MAX_UNICODE_CODEPOINT`.
180    pub fn load_bitmap(&mut self, codepoint: u32) -> Bitmap {
181	assert!(codepoint <= MAX_UNICODE_CODEPOINT);
182	let page = codepoint >> 8;
183	self.load_page(page);
184	let ret = self.get_bitmap(codepoint);
185	// Justification for this unsafe block:
186	//
187	// Once loaded, the decompressed data for a given page will never be
188	// freed or moved until (and unless) this Unifont instance is dropped.
189	// Therefore, the implied lifetime constraint is met.
190	//
191	// A previous iteration of this API had a "purge_page" call, but that
192	// broke this safety assumption and was therefore removed.
193	if let Some(x) = ret { return unsafe { std::mem::transmute(x) } }
194	drop(ret);
195	if codepoint == 0xFFFD {
196	    panic!("U+FFFD should have been loaded but wasn't!");
197	}
198	else {
199	    // this will happen if U+FFFD was needed but not yet loaded
200	    self.load_bitmap(0xFFFD)
201	}
202    }
203    /// Gets the Unifont bitmap corresponding to the given Unicode codepoint,
204    /// if and only if it is already loaded.
205    ///
206    /// Will return the bitmap for `U+FFFD REPLACEMENT CHAR` (�) if Unifont
207    /// does not include a glyph for this bitmap, iff the respective page of
208    /// the font is already loaded.
209    ///
210    /// **PANICS** if you pass a `codepoint` larger than
211    /// `MAX_UNICODE_CODEPOINT`.
212    pub fn get_bitmap(&self, codepoint: u32) -> Option<Bitmap> {
213	assert!(codepoint <= MAX_UNICODE_CODEPOINT);
214	let page = codepoint >> 8;
215	let ch = codepoint & 255;
216	let raw_data = match self.pages[page as usize].raw_data.as_ref() {
217	    None => return None,
218	    Some(x) => &x[..],
219	};
220	let offset_offset = (ch as usize) * 2;
221	let char_offset =
222	    u16::from_ne_bytes(raw_data[offset_offset .. offset_offset + 2]
223			       .try_into().unwrap());
224	if char_offset == 0 {
225	    if codepoint == 0xFFFD {
226		panic!("U+FFFD should have been present but wasn't!");
227	    }
228	    else {
229		self.get_bitmap(0xFFFD)
230	    }
231	}
232	else {
233	    let is_wide = (char_offset & 1) != 0;
234	    let real_offset = (char_offset & !1) as usize;
235	    let region = &raw_data[real_offset .. real_offset +
236				   if is_wide { 32 } else { 16 }];
237	    Some(Bitmap { bytes: region })
238	}
239    }
240    /// Loads a given page, if it's not loaded already. (Since loading is
241    /// usually done transparently, this isn't usually needed.)
242    pub fn load_page(&mut self, page: u32) {
243	assert!(page <= MAX_UNICODE_PAGE);
244	let target_page = &mut self.pages[page as usize];
245	if target_page.raw_data.is_none() {
246	    if target_page.uncompressed_size == 0 {
247		target_page.raw_data = Some(vec![0u8; 512]);
248	    }
249	    else {
250		let mut inflater = flate2::Decompress::new(true);
251		let mut buf = vec![0; target_page.uncompressed_size as usize];
252		inflater.decompress(&UNIFONT_DATA[target_page.compressed_offset as usize ..], &mut buf[..], flate2::FlushDecompress::Finish).expect("The Unifont bitmap data in this application appears to be corrupted!");
253		let mut running_offset = 512u16;
254		for n in 0 .. 256 {
255		    let i = (n * 2) as usize;
256		    let in_offset = u16::from_be_bytes(buf[i..i+2].try_into().unwrap());
257		    let out_offset;
258		    match in_offset {
259			0x0000 => {
260			    // narrow char,
261			    out_offset = running_offset;
262			    running_offset += 16;
263			},
264			0x0001 => {
265			    // wide char
266			    out_offset = running_offset | 1;
267			    running_offset += 32;
268			},
269			0x0101 => {
270			    // invalid char
271			    out_offset = 0;
272			},
273			_ => {
274			    panic!("The Unifont bitmap data in this application appears to be corrupted!");
275			},
276		    }
277		    buf[i..i+2].copy_from_slice(&out_offset.to_ne_bytes());
278		}
279		target_page.raw_data = Some(buf)
280	    }
281	}
282    }
283    /// Creates a new instance of this class, with no glyphs cached yet.
284    ///
285    /// The font data is embedded in your executable, and does not need to be
286    /// provided any other way.
287    pub fn open() -> Unifont {
288	// oh boy, this pain point hasn't been resolved yet
289	let mut pages: [std::mem::MaybeUninit<PageInfo>;
290			NUM_UNICODE_PAGES as usize]
291	    = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
292	for el in &mut pages[..] {
293	    unsafe { std::ptr::write(el.as_mut_ptr(), PageInfo {
294		compressed_offset: 0, uncompressed_size: 0, raw_data: None
295	    }) }
296	}
297	let mut ret = Unifont { pages: unsafe { std::mem::transmute(pages) } };
298	ret.populate_page_infos();
299	ret
300    }
301    fn populate_page_infos(&mut self) {
302	let mut input = UNIFONT_DATA;
303	let start_offset: u32
304	    = input.read_u32::<BigEndian>().unwrap() + 4;
305	let mut running_offset = start_offset;
306	let mut buf = [0u8; NUM_UNICODE_PAGES as usize * 4];
307	let mut fish = flate2::Decompress::new(true);
308	fish.decompress(&UNIFONT_DATA[4..(running_offset as usize)],
309			&mut buf, flate2::FlushDecompress::Finish).unwrap();
310	let mut i = &buf[..];
311	for el in &mut self.pages[..] {
312	    let uncompressed_size = i.read_u16::<BigEndian>().unwrap();
313	    let compressed_size = i.read_u16::<BigEndian>().unwrap();
314	    el.uncompressed_size = uncompressed_size as u32;
315	    if el.uncompressed_size > 0 {
316		el.compressed_offset = running_offset;
317		running_offset += compressed_size as u32;
318	    }
319	    else {
320		el.compressed_offset = 0;
321	    }
322	}
323    }
324}
325
#[cfg(test)]
mod test {
    use super::*;

    /// A codepoint that Unifont is expected to have no glyph for must resolve
    /// to exactly the same bitmap as U+FFFD, through both the loading and the
    /// non-loading lookup paths.
    #[test]
    fn bogus_page() {
        let mut unifont = Unifont::open();
        // `load_bitmap` keeps the font mutably borrowed for as long as the
        // returned bitmap lives, so copy the bytes out before the next call.
        let fffd_bytes = unifont.load_bitmap(0xFFFD).get_bytes().to_vec();
        let bad_bytes = unifont.load_bitmap(0x104560).get_bytes().to_vec();
        assert_eq!(fffd_bytes, bad_bytes);

        let fffd = unifont.get_bitmap(0xFFFD);
        let bad = unifont.get_bitmap(0x104560);
        // Comparing the two `Option`s alone would also pass if both lookups
        // came back `None`, so make sure a glyph was actually found.
        assert!(fffd.is_some(), "U+FFFD should be available once its page is loaded");
        assert_eq!(fffd, bad);
    }
}