Skip to main content

tor_geoip/
lib.rs

1//! A crate for performing GeoIP lookups using the Tor GeoIP database.
2
3// @@ begin lint list maintained by maint/add_warning @@
4#![allow(renamed_and_removed_lints)] // @@REMOVE_WHEN(ci_arti_stable)
5#![allow(unknown_lints)] // @@REMOVE_WHEN(ci_arti_nightly)
6#![warn(missing_docs)]
7#![warn(noop_method_call)]
8#![warn(unreachable_pub)]
9#![warn(clippy::all)]
10#![deny(clippy::await_holding_lock)]
11#![deny(clippy::cargo_common_metadata)]
12#![deny(clippy::cast_lossless)]
13#![deny(clippy::checked_conversions)]
14#![warn(clippy::cognitive_complexity)]
15#![deny(clippy::debug_assert_with_mut_call)]
16#![deny(clippy::exhaustive_enums)]
17#![deny(clippy::exhaustive_structs)]
18#![deny(clippy::expl_impl_clone_on_copy)]
19#![deny(clippy::fallible_impl_from)]
20#![deny(clippy::implicit_clone)]
21#![deny(clippy::large_stack_arrays)]
22#![warn(clippy::manual_ok_or)]
23#![deny(clippy::missing_docs_in_private_items)]
24#![warn(clippy::needless_borrow)]
25#![warn(clippy::needless_pass_by_value)]
26#![warn(clippy::option_option)]
27#![deny(clippy::print_stderr)]
28#![deny(clippy::print_stdout)]
29#![warn(clippy::rc_buffer)]
30#![deny(clippy::ref_option_ref)]
31#![warn(clippy::semicolon_if_nothing_returned)]
32#![warn(clippy::trait_duplication_in_bounds)]
33#![deny(clippy::unchecked_time_subtraction)]
34#![deny(clippy::unnecessary_wraps)]
35#![warn(clippy::unseparated_literal_suffix)]
36#![deny(clippy::unwrap_used)]
37#![deny(clippy::mod_module_files)]
38#![allow(clippy::let_unit_value)] // This can reasonably be done for explicitness
39#![allow(clippy::uninlined_format_args)]
40#![allow(clippy::significant_drop_in_scrutinee)] // arti/-/merge_requests/588/#note_2812945
41#![allow(clippy::result_large_err)] // temporary workaround for arti#587
42#![allow(clippy::needless_raw_string_hashes)] // complained-about code is fine, often best
43#![allow(clippy::needless_lifetimes)] // See arti#1765
44#![allow(mismatched_lifetime_syntaxes)] // temporary workaround for arti#2060
45#![deny(clippy::unused_async)]
46//! <!-- @@ end lint list maintained by maint/add_warning @@ -->
47
48// TODO #1645 (either remove this, or decide to have it everywhere)
49#![cfg_attr(not(all(feature = "full")), allow(unused))]
50
51pub use crate::err::Error;
52use rangemap::RangeInclusiveMap;
53use std::fmt::{Debug, Display, Formatter};
54use std::net::{IpAddr, Ipv6Addr};
55use std::num::{NonZeroU8, NonZeroU32, TryFromIntError};
56use std::str::FromStr;
57use std::sync::{Arc, OnceLock};
58
59mod err;
60
/// An embedded copy of the latest geoip v4 database at the time of compilation.
///
/// This is the raw text of the database; it is only parsed when
/// [`GeoipDb::new_embedded`] is first called.
///
/// FIXME(eta): This does use a few megabytes of binary size, which is less than ideal.
///             It would be better to parse it at compile time or something.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_V4: &str = include_str!("../data/geoip");

/// An embedded copy of the latest geoip v6 database at the time of compilation.
///
/// Like the v4 copy, this is stored as unparsed text.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_V6: &str = include_str!("../data/geoip6");

/// A parsed copy of the embedded database.
///
/// This starts out unset, and is filled in by the first call to
/// [`GeoipDb::new_embedded`]; every later call hands out a clone of the same `Arc`.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_PARSED: OnceLock<Arc<GeoipDb>> = OnceLock::new();
75
/// A two-letter country code.
///
/// Specifically, this type represents a purported "ISO 3166-1 alpha-2" country
/// code, such as "IT" for Italy or "UY" for Uruguay.
///
/// It does not include the sentinel value `??` that we use to represent
/// "country unknown"; if you need that, use [`OptionCc`]. Other than that, we
/// do not check whether the country code represents a real country: we only
/// ensure that it is a pair of printing ASCII characters.
///
/// ASCII letters are folded to upper case when a code is parsed, so `"us"` and
/// `"US"` produce equal values.
///
/// Note that the geoip databases included with Arti will only include real
/// countries; we do not include the pseudo-countries `A1` through `An` for
/// "anonymous proxies", since doing so would mean putting nearly all Tor relays
/// into one of those countries.
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct CountryCode {
    /// The underlying value (two printable ASCII characters, stored uppercase).
    ///
    /// The special value `??` is excluded, since it is not a country; use
    /// `OptionCc` instead if you need to represent that.
    ///
    /// We store these as `NonZeroU8` so that an `Option<CountryCode>` only has to
    /// take 2 bytes. This helps with alignment and storage.
    ///
    /// The `AsRef<str>` implementation reinterprets this array as bytes and
    /// expects them to be valid UTF-8, so only the checked constructor should
    /// ever write to this field.
    inner: [NonZeroU8; 2],
}
101
102impl CountryCode {
103    /// Make a new `CountryCode`.
104    fn new(cc_orig: &str) -> Result<Self, Error> {
105        /// Try to convert an array of 2 bytes into an array of 2 nonzero bytes.
106        #[inline]
107        fn try_cvt_to_nz(inp: [u8; 2]) -> Result<[NonZeroU8; 2], TryFromIntError> {
108            // I have confirmed that the asm here is reasonably efficient.
109            Ok([inp[0].try_into()?, inp[1].try_into()?])
110        }
111
112        let cc = cc_orig.to_ascii_uppercase();
113
114        let cc: [u8; 2] = cc
115            .as_bytes()
116            .try_into()
117            .map_err(|_| Error::BadCountryCode(cc))?;
118
119        if !cc.iter().all(|b| b.is_ascii() && !b.is_ascii_control()) {
120            return Err(Error::BadCountryCode(cc_orig.to_owned()));
121        }
122
123        if &cc == b"??" {
124            return Err(Error::NowhereNotSupported);
125        }
126
127        Ok(Self {
128            inner: try_cvt_to_nz(cc).map_err(|_| Error::BadCountryCode(cc_orig.to_owned()))?,
129        })
130    }
131
132    /// Get the actual country code.
133    ///
134    /// This just calls `.as_ref()`.
135    pub fn get(&self) -> &str {
136        self.as_ref()
137    }
138}
139
140impl Display for CountryCode {
141    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
142        write!(f, "{}", self.as_ref())
143    }
144}
145
146impl Debug for CountryCode {
147    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
148        write!(f, "CountryCode(\"{}\")", self.as_ref())
149    }
150}
151
152impl AsRef<str> for CountryCode {
153    fn as_ref(&self) -> &str {
154        /// Convert a reference to an array of 2 nonzero bytes to a reference to
155        /// an array of 2 bytes.
156        #[inline]
157        fn cvt_ref(inp: &[NonZeroU8; 2]) -> &[u8; 2] {
158            // SAFETY: Every NonZeroU8 has a layout and bit validity that is
159            // also a valid u8.  The layout of arrays is also guaranteed.
160            //
161            // (We don't use try_into here because we need to return a str that
162            // points to a reference to self.)
163            let ptr = inp.as_ptr() as *const u8;
164            let slice = unsafe { std::slice::from_raw_parts(ptr, inp.len()) };
165            slice
166                .try_into()
167                .expect("the resulting slice should have the correct length!")
168        }
169
170        // This shouldn't ever panic, since we shouldn't feed non-utf8 country
171        // codes in.
172        //
173        // In theory we could use from_utf8_unchecked, but that's probably not
174        // needed.
175        std::str::from_utf8(cvt_ref(&self.inner)).expect("invalid country code in CountryCode")
176    }
177}
178
179impl FromStr for CountryCode {
180    type Err = Error;
181
182    fn from_str(s: &str) -> Result<Self, Self::Err> {
183        CountryCode::new(s)
184    }
185}
186
/// Wrapper for an `Option<`[`CountryCode`]`>` that encodes `None` as `??`.
///
/// Used so that we can implement foreign traits.
///
/// Parsing the string `??` yields `OptionCc(None)`, and displaying
/// `OptionCc(None)` writes `??` back out; any other value is parsed and
/// displayed as the wrapped [`CountryCode`].
#[derive(
    Copy, Clone, Debug, Eq, PartialEq, derive_more::Into, derive_more::From, derive_more::AsRef,
)]
#[allow(clippy::exhaustive_structs)]
pub struct OptionCc(pub Option<CountryCode>);
195
196impl FromStr for OptionCc {
197    type Err = Error;
198
199    fn from_str(s: &str) -> Result<Self, Self::Err> {
200        match CountryCode::new(s) {
201            Err(Error::NowhereNotSupported) => Ok(None.into()),
202            Err(e) => Err(e),
203            Ok(cc) => Ok(Some(cc).into()),
204        }
205    }
206}
207
208impl Display for OptionCc {
209    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
210        match self.0 {
211            Some(cc) => write!(f, "{}", cc),
212            None => write!(f, "??"),
213        }
214    }
215}
216
/// A country code / ASN definition.
///
/// This is the value stored for each address range in a [`GeoipDb`].
///
/// Type lifted from `geoip-db-tool` in the C-tor source.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
struct NetDefn {
    /// The country code.
    ///
    /// We translate the value "??" into None.
    cc: Option<CountryCode>,
    /// The ASN, if we have one. We translate the value "0" into None.
    ///
    /// A database line with no ASN field at all is also stored as None.
    asn: Option<NonZeroU32>,
}
229
230impl NetDefn {
231    /// Make a new `NetDefn`.
232    fn new(cc: &str, asn: Option<u32>) -> Result<Self, Error> {
233        let asn = NonZeroU32::new(asn.unwrap_or(0));
234        let cc = cc.parse::<OptionCc>()?.into();
235
236        Ok(Self { cc, asn })
237    }
238
239    /// Return the country code.
240    fn country_code(&self) -> Option<&CountryCode> {
241        self.cc.as_ref()
242    }
243
244    /// Return the ASN, if there is one.
245    fn asn(&self) -> Option<u32> {
246        self.asn.as_ref().map(|x| x.get())
247    }
248}
249
/// A database of IP addresses to country codes.
///
/// Each entry maps an inclusive range of addresses to a country code and,
/// where the database provides one, an ASN.
///
/// Build one with [`GeoipDb::new_from_legacy_format`], or (with the
/// `embedded-db` feature) with `GeoipDb::new_embedded`.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct GeoipDb {
    /// The IPv4 subset of the database, with v4 addresses stored as 32-bit integers.
    map_v4: RangeInclusiveMap<u32, NetDefn>,
    /// The IPv6 subset of the database, with v6 addresses stored as 128-bit integers.
    map_v6: RangeInclusiveMap<u128, NetDefn>,
}
258
259impl GeoipDb {
260    /// Make a new `GeoipDb` using a compiled-in copy of the GeoIP database.
261    ///
262    /// The returned instance of the database is shared with `Arc` across all invocations of this
263    /// function in the same program.
264    #[cfg(feature = "embedded-db")]
265    pub fn new_embedded() -> Arc<Self> {
266        Arc::clone(EMBEDDED_DB_PARSED.get_or_init(|| {
267            Arc::new(
268                // It's reasonable to assume the one we embedded is fine -- we'll test it in CI, etc.
269                Self::new_from_legacy_format(EMBEDDED_DB_V4, EMBEDDED_DB_V6)
270                    .expect("failed to parse embedded geoip database"),
271            )
272        }))
273    }
274
275    /// Make a new `GeoipDb` using provided copies of the v4 and v6 database, in Tor legacy format.
276    pub fn new_from_legacy_format(db_v4: &str, db_v6: &str) -> Result<Self, Error> {
277        let mut ret = GeoipDb {
278            map_v4: Default::default(),
279            map_v6: Default::default(),
280        };
281
282        for line in db_v4.lines() {
283            if line.starts_with('#') {
284                continue;
285            }
286            let line = line.trim();
287            if line.is_empty() {
288                continue;
289            }
290            let mut split = line.split(',');
291            let from = split
292                .next()
293                .ok_or(Error::BadFormat("empty line somehow?"))?
294                .parse::<u32>()?;
295            let to = split
296                .next()
297                .ok_or(Error::BadFormat("line with insufficient commas"))?
298                .parse::<u32>()?;
299            let cc = split
300                .next()
301                .ok_or(Error::BadFormat("line with insufficient commas"))?;
302            let asn = split.next().map(|x| x.parse::<u32>()).transpose()?;
303
304            let defn = NetDefn::new(cc, asn)?;
305
306            ret.map_v4.insert(from..=to, defn);
307        }
308
309        // This is slightly copypasta, but probably less readable to merge into one thing.
310        for line in db_v6.lines() {
311            if line.starts_with('#') {
312                continue;
313            }
314            let line = line.trim();
315            if line.is_empty() {
316                continue;
317            }
318            let mut split = line.split(',');
319            let from = split
320                .next()
321                .ok_or(Error::BadFormat("empty line somehow?"))?
322                .parse::<Ipv6Addr>()?;
323            let to = split
324                .next()
325                .ok_or(Error::BadFormat("line with insufficient commas"))?
326                .parse::<Ipv6Addr>()?;
327            let cc = split
328                .next()
329                .ok_or(Error::BadFormat("line with insufficient commas"))?;
330            let asn = split.next().map(|x| x.parse::<u32>()).transpose()?;
331
332            let defn = NetDefn::new(cc, asn)?;
333
334            ret.map_v6.insert(from.into()..=to.into(), defn);
335        }
336
337        Ok(ret)
338    }
339
340    /// Get the `NetDefn` for an IP address.
341    fn lookup_defn(&self, ip: IpAddr) -> Option<&NetDefn> {
342        match ip {
343            IpAddr::V4(v4) => self.map_v4.get(&v4.into()),
344            IpAddr::V6(v6) => self.map_v6.get(&v6.into()),
345        }
346    }
347
348    /// Get a 2-letter country code for the given IP address, if this data is available.
349    pub fn lookup_country_code(&self, ip: IpAddr) -> Option<&CountryCode> {
350        self.lookup_defn(ip).and_then(|x| x.country_code())
351    }
352
353    /// Determine a 2-letter country code for a host with multiple IP addresses.
354    ///
355    /// This looks up all of the IP addresses with `lookup_country_code`. If the lookups
356    /// return different countries, `None` is returned. IP addresses that fail to resolve
357    /// into a country are ignored if some of the other addresses do resolve successfully.
358    pub fn lookup_country_code_multi<I>(&self, ips: I) -> Option<&CountryCode>
359    where
360        I: IntoIterator<Item = IpAddr>,
361    {
362        let mut ret = None;
363
364        for ip in ips {
365            if let Some(cc) = self.lookup_country_code(ip) {
366                // If we already have a return value and it's different, then return None;
367                // a server can't be in two different countries.
368                if ret.is_some() && ret != Some(cc) {
369                    return None;
370                }
371
372                ret = Some(cc);
373            }
374        }
375
376        ret
377    }
378
379    /// Return the ASN the IP address is in, if this data is available.
380    pub fn lookup_asn(&self, ip: IpAddr) -> Option<u32> {
381        self.lookup_defn(ip)?.asn()
382    }
383}
384
/// A (representation of a) host on the network which may have a known country code.
pub trait HasCountryCode {
    /// Return the country code in which this server is most likely located.
    ///
    /// This is usually implemented by simple GeoIP lookup on the addresses provided by `HasAddrs`.
    /// It follows that the server might not actually be in the returned country, but this is a
    /// halfway decent estimate for what other servers might guess the server's location to be
    /// (and thus useful for e.g. getting around simple geo-blocks, or having webpages return
    /// the correct localised versions).
    ///
    /// Returning `None` signifies that no country code information is available. (Conflicting
    /// GeoIP lookup results might also cause `None` to be returned.)
    ///
    /// The code is returned by value rather than by reference; [`CountryCode`] is `Copy`.
    fn country_code(&self) -> Option<CountryCode>;
}
399
#[cfg(test)]
mod test {
    // @@ begin test lint list maintained by maint/add_warning @@
    #![allow(clippy::bool_assert_comparison)]
    #![allow(clippy::clone_on_copy)]
    #![allow(clippy::dbg_macro)]
    #![allow(clippy::mixed_attributes_style)]
    #![allow(clippy::print_stderr)]
    #![allow(clippy::print_stdout)]
    #![allow(clippy::single_char_pattern)]
    #![allow(clippy::unwrap_used)]
    #![allow(clippy::unchecked_time_subtraction)]
    #![allow(clippy::useless_vec)]
    #![allow(clippy::needless_pass_by_value)]
    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->

    use super::*;
    use std::net::Ipv4Addr;

    // Check that the compiled-in database parses, and answers one v4 and one
    // v6 query as expected.
    //
    // NOTE(eta): this test takes a whole 1.6 seconds in *non-release* mode
    #[test]
    #[cfg(feature = "embedded-db")]
    fn embedded_db() {
        let db = GeoipDb::new_embedded();

        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(8, 8, 8, 8).into())
                .map(|x| x.as_ref()),
            Some("US")
        );

        assert_eq!(
            db.lookup_country_code("2001:4860:4860::8888".parse().unwrap())
                .map(|x| x.as_ref()),
            Some("US")
        );
    }

    // Check lookups against a tiny hand-written database.
    #[test]
    fn basic_lookups() {
        // 16909056 is 1.2.3.0 and 16909311 is 1.2.3.255, written as decimal integers.
        let src_v4 = r#"
        16909056,16909311,GB
        "#;
        // The second range uses the "country unknown" sentinel.
        let src_v6 = r#"
        fe80::,fe81::,US
        dead:beef::,dead:ffff::,??
        "#;
        let db = GeoipDb::new_from_legacy_format(src_v4, src_v6).unwrap();

        // Inside the only v4 range.
        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(1, 2, 3, 4).into())
                .map(|x| x.as_ref()),
            Some("GB")
        );

        // Outside every v4 range.
        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(1, 1, 1, 1).into()),
            None
        );

        // Inside the first v6 range.
        assert_eq!(
            db.lookup_country_code("fe80::dead:beef".parse().unwrap())
                .map(|x| x.as_ref()),
            Some("US")
        );

        // Just past the (inclusive) end of the first v6 range.
        assert_eq!(
            db.lookup_country_code("fe81::dead:beef".parse().unwrap()),
            None
        );
        // Inside a range whose country is `??`: there is an entry, but no country.
        assert_eq!(
            db.lookup_country_code("dead:beef::1".parse().unwrap()),
            None
        );
    }

    // Check which strings are accepted and rejected as country codes.
    #[test]
    fn cc_parse() -> Result<(), Error> {
        // real countries.
        assert_eq!(CountryCode::from_str("us")?, CountryCode::from_str("US")?);
        assert_eq!(CountryCode::from_str("UY")?, CountryCode::from_str("UY")?);

        // not real as of this writing, but still representable.
        assert_eq!(CountryCode::from_str("A7")?, CountryCode::from_str("a7")?);
        assert_eq!(CountryCode::from_str("xz")?, CountryCode::from_str("xz")?);

        // Can't convert to two bytes.
        assert!(matches!(
            CountryCode::from_str("z"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("🐻‍❄️"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("Sheboygan"),
            Err(Error::BadCountryCode(_))
        ));

        // Can convert to two bytes, but still not printable ascii
        assert!(matches!(
            CountryCode::from_str("\r\n"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("\0\0"),
            Err(Error::BadCountryCode(_))
        ));
        // (A single non-ASCII character that is two bytes long in UTF-8.)
        assert!(matches!(
            CountryCode::from_str("¡"),
            Err(Error::BadCountryCode(_))
        ));

        // Not a country.
        assert!(matches!(
            CountryCode::from_str("??"),
            Err(Error::NowhereNotSupported)
        ));

        Ok(())
    }

    // Check that `OptionCc` parses real codes like `CountryCode` does, and
    // turns the sentinel into `None`.
    #[test]
    fn opt_cc_parse() -> Result<(), Error> {
        assert_eq!(
            CountryCode::from_str("br")?,
            OptionCc::from_str("BR")?.0.unwrap()
        );
        assert!(OptionCc::from_str("??")?.0.is_none());

        Ok(())
    }
}
533}