Skip to main content

tor_geoip/
lib.rs

1//! A crate for performing GeoIP lookups using the Tor GeoIP database.
2
3// @@ begin lint list maintained by maint/add_warning @@
4#![allow(renamed_and_removed_lints)] // @@REMOVE_WHEN(ci_arti_stable)
5#![allow(unknown_lints)] // @@REMOVE_WHEN(ci_arti_nightly)
6#![warn(missing_docs)]
7#![warn(noop_method_call)]
8#![warn(unreachable_pub)]
9#![warn(clippy::all)]
10#![deny(clippy::await_holding_lock)]
11#![deny(clippy::cargo_common_metadata)]
12#![deny(clippy::cast_lossless)]
13#![deny(clippy::checked_conversions)]
14#![warn(clippy::cognitive_complexity)]
15#![deny(clippy::debug_assert_with_mut_call)]
16#![deny(clippy::exhaustive_enums)]
17#![deny(clippy::exhaustive_structs)]
18#![deny(clippy::expl_impl_clone_on_copy)]
19#![deny(clippy::fallible_impl_from)]
20#![deny(clippy::implicit_clone)]
21#![deny(clippy::large_stack_arrays)]
22#![warn(clippy::manual_ok_or)]
23#![deny(clippy::missing_docs_in_private_items)]
24#![warn(clippy::needless_borrow)]
25#![warn(clippy::needless_pass_by_value)]
26#![warn(clippy::option_option)]
27#![deny(clippy::print_stderr)]
28#![deny(clippy::print_stdout)]
29#![warn(clippy::rc_buffer)]
30#![deny(clippy::ref_option_ref)]
31#![warn(clippy::semicolon_if_nothing_returned)]
32#![warn(clippy::trait_duplication_in_bounds)]
33#![deny(clippy::unchecked_time_subtraction)]
34#![deny(clippy::unnecessary_wraps)]
35#![warn(clippy::unseparated_literal_suffix)]
36#![deny(clippy::unwrap_used)]
37#![deny(clippy::mod_module_files)]
38#![allow(clippy::let_unit_value)] // This can reasonably be done for explicitness
39#![allow(clippy::uninlined_format_args)]
40#![allow(clippy::significant_drop_in_scrutinee)] // arti/-/merge_requests/588/#note_2812945
41#![allow(clippy::result_large_err)] // temporary workaround for arti#587
42#![allow(clippy::needless_raw_string_hashes)] // complained-about code is fine, often best
43#![allow(clippy::needless_lifetimes)] // See arti#1765
44#![allow(mismatched_lifetime_syntaxes)] // temporary workaround for arti#2060
45#![allow(clippy::collapsible_if)] // See arti#2342
46#![deny(clippy::unused_async)]
47#![deny(clippy::string_slice)] // See arti#2571
48//! <!-- @@ end lint list maintained by maint/add_warning @@ -->
49
50// TODO #1645 (either remove this, or decide to have it everywhere)
51#![cfg_attr(not(all(feature = "full")), allow(unused))]
52
53use crate::dense_range_map::DenseRangeMap;
54pub use crate::err::Error;
55use std::fmt::{Debug, Display, Formatter};
56use std::net::{IpAddr, Ipv6Addr};
57use std::num::{NonZeroU16, NonZeroU32};
58use std::ops::RangeInclusive;
59use std::str::FromStr;
60use std::sync::{Arc, OnceLock};
61
62mod dense_range_map;
63mod err;
64
/// A parsed copy of the embedded database.
///
/// This is filled in at most once, by the first call to
/// [`GeoipDb::new_embedded`]; every later call clones the `Arc` stored here
/// instead of building the database again.
#[cfg(feature = "embedded-db")]
static EMBEDDED_DB_PARSED: OnceLock<Arc<GeoipDb>> = OnceLock::new();
68
/// A two-letter country code.
///
/// Specifically, this type represents a purported "ISO 3166-1 alpha-2" country
/// code, such as "IT" for Italy or "UY" for Uruguay.
///
/// It does not include the sentinel value `??` that we use to represent
/// "country unknown"; if you need that, use [`OptionCc`]. Other than that, we
/// do not check whether the country code represents a real country: we only
/// ensure that it is a pair of printing ASCII characters.
///
/// Note that the geoip databases included with Arti will only include real
/// countries; we do not include the pseudo-countries `A1` through `An` for
/// "anonymous proxies", since doing so would mean putting nearly all Tor relays
/// into one of those countries.
#[derive(Copy, Clone, Eq, PartialEq)]
#[repr(transparent)]
pub struct CountryCode {
    /// The underlying value (two printable ASCII characters, stored uppercase).
    ///
    /// The special value `??` is excluded, since it is not a country; use
    /// `OptionCc` instead if you need to represent that.
    ///
    /// We store these as `NonZeroU16` so that an `Option<CountryCode>` only has to
    /// take 2 bytes. This helps with alignment and storage.
    ///
    /// (We use a `NonZeroU16` rather than `[NonZeroU8; 2]` to ensure that every
    /// bit representation is a valid `Option<CountryCode>`.)
    ///
    /// The two characters are packed with `u16::from_ne_bytes`, so the bytes of
    /// this value in memory are the two characters in their original order.
    /// The `AsRef<str>` implementation relies on that: it reinterprets this
    /// field in place as two bytes, so that the `&str` it returns can point
    /// into `self`.
    inner: NonZeroU16,
}
98
99impl CountryCode {
100    /// Make a new `CountryCode`.
101    fn new(cc_orig: &str) -> Result<Self, Error> {
102        /// Try to convert an array of 2 bytes into a NonZeroU16.
103        #[inline]
104        fn try_cvt_to_nz(inp: [u8; 2]) -> Result<NonZeroU16, Error> {
105            if inp[0] == 0 || inp[1] == 0 {
106                return Err(Error::BadCountryCode("Country code contained NULs".into()));
107            }
108            Ok(u16::from_ne_bytes(inp)
109                .try_into()
110                .expect("zero arrived surprisingly"))
111        }
112
113        let cc = cc_orig.to_ascii_uppercase();
114
115        let cc: [u8; 2] = cc
116            .as_bytes()
117            .try_into()
118            .map_err(|_| Error::BadCountryCode(cc))?;
119
120        if !cc.iter().all(|b| b.is_ascii() && !b.is_ascii_control()) {
121            return Err(Error::BadCountryCode(cc_orig.to_owned()));
122        }
123
124        if &cc == b"??" {
125            return Err(Error::NowhereNotSupported);
126        }
127
128        Ok(Self {
129            inner: try_cvt_to_nz(cc).map_err(|_| Error::BadCountryCode(cc_orig.to_owned()))?,
130        })
131    }
132
133    /// Get the actual country code.
134    ///
135    /// This just calls `.as_ref()`.
136    pub fn get(&self) -> &str {
137        self.as_ref()
138    }
139}
140
141impl Display for CountryCode {
142    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
143        write!(f, "{}", self.as_ref())
144    }
145}
146
147impl Debug for CountryCode {
148    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
149        write!(f, "CountryCode(\"{}\")", self.as_ref())
150    }
151}
152
153impl AsRef<str> for CountryCode {
154    fn as_ref(&self) -> &str {
155        /// Convert a reference to a NonZeroU16 to a reference to
156        /// an array of 2 bytes.
157        #[inline]
158        fn cvt_ref(inp: &NonZeroU16) -> &[u8; 2] {
159            // SAFETY: Every NonZeroU16 has a layout, alignment, and bit validity that is
160            // also a valid [u8; 2].  The layout of arrays is also guaranteed.
161            //
162            // (We don't use try_into here because we need to return a str that
163            // points to a reference to self.)
164            let slice: &[NonZeroU16] = std::slice::from_ref(inp);
165            let (_, slice, _) = unsafe { slice.align_to::<u8>() };
166            slice
167                .try_into()
168                .expect("the resulting slice should have the correct length!")
169        }
170
171        // This shouldn't ever panic, since we shouldn't feed non-utf8 country
172        // codes in.
173        //
174        // In theory we could use from_utf8_unchecked, but that's probably not
175        // needed.
176        std::str::from_utf8(cvt_ref(&self.inner)).expect("invalid country code in CountryCode")
177    }
178}
179
180impl FromStr for CountryCode {
181    type Err = Error;
182
183    fn from_str(s: &str) -> Result<Self, Self::Err> {
184        CountryCode::new(s)
185    }
186}
187
/// Wrapper for an `Option<`[`CountryCode`]`>` that encodes `None` as `??`.
///
/// Used so that we can implement foreign traits.
///
/// The `FromStr` implementation parses `??` as `None` and any other valid
/// country code as `Some(..)`; the `Display` implementation prints `??` for
/// `None`.
#[derive(
    Copy, Clone, Debug, Eq, PartialEq, derive_more::Into, derive_more::From, derive_more::AsRef,
)]
// The only field of this struct is `pub`.
#[allow(clippy::exhaustive_structs)]
pub struct OptionCc(pub Option<CountryCode>);
196
197impl FromStr for OptionCc {
198    type Err = Error;
199
200    fn from_str(s: &str) -> Result<Self, Self::Err> {
201        match CountryCode::new(s) {
202            Err(Error::NowhereNotSupported) => Ok(None.into()),
203            Err(e) => Err(e),
204            Ok(cc) => Ok(Some(cc).into()),
205        }
206    }
207}
208
209impl Display for OptionCc {
210    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
211        match self.0 {
212            Some(cc) => write!(f, "{}", cc),
213            None => write!(f, "??"),
214        }
215    }
216}
217
/// The type of an ASN (autonomous system number).
///
/// Zero is not representable: when a database line is parsed, an ASN of `0`
/// is treated as "no ASN" and stored as `None`.
type Asn = NonZeroU32;
220
/// A database of IP addresses to country codes.
///
/// Each address range can also carry an ASN; see [`GeoipDb::lookup_asn`].
/// IPv4 and IPv6 ranges are kept in two separate maps.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct GeoipDb {
    /// The IPv4 subset of the database, with v4 addresses stored as 32-bit integers.
    map_v4: DenseRangeMap<u32, CountryCode, Asn>,
    /// The IPv6 subset of the database, with v6 addresses stored as 128-bit integers.
    map_v6: DenseRangeMap<u128, CountryCode, Asn>,
}
229
230impl GeoipDb {
231    /// Make a new `GeoipDb` using a compiled-in copy of the GeoIP database.
232    ///
233    /// The returned instance of the database is shared with `Arc` across all invocations of this
234    /// function in the same program.
235    #[cfg(feature = "embedded-db")]
236    pub fn new_embedded() -> Arc<Self> {
237        Arc::clone(EMBEDDED_DB_PARSED.get_or_init(|| {
238            use tor_geoip_db as db;
239            fn cvt_ccs(ccs: &'static [Option<NonZeroU16>]) -> &'static [Option<CountryCode>] {
240                // SAFETY: CountryCode is a repr(transparent) for NonZeroU16.
241                let (pre, data, post) = unsafe { ccs.align_to::<Option<CountryCode>>() };
242                assert!(pre.is_empty());
243                assert!(post.is_empty());
244                data
245            }
246
247            let map_v4 = DenseRangeMap::from_static_parts(db::ipv4s(), cvt_ccs(db::ipv4c()), None);
248            let map_v6 = DenseRangeMap::from_static_parts(db::ipv6s(), cvt_ccs(db::ipv6c()), None);
249
250            Arc::new(
251                // It's reasonable to assume the one we embedded is fine --
252                // we'll test it in CI, etc.
253                GeoipDb { map_v4, map_v6 },
254            )
255        }))
256    }
257
258    /// Make a new `GeoipDb` using provided copies of the v4 and v6 database, in Tor legacy format.
259    pub fn new_from_legacy_format(
260        db_v4: &str,
261        db_v6: &str,
262        include_asn: bool,
263    ) -> Result<Self, Error> {
264        let discard_asn = !include_asn;
265        let map_v4 = DenseRangeMap::try_from_sorted_inclusive_ranges(
266            db_v4
267                .lines()
268                .filter_map(|line| parse_line::<u32>(line).transpose()),
269            discard_asn,
270        )?;
271
272        let map_v6 = DenseRangeMap::try_from_sorted_inclusive_ranges(
273            db_v6
274                .lines()
275                .filter_map(|line| parse_line::<Ipv6Addr>(line).transpose()),
276            discard_asn,
277        )?;
278
279        Ok(Self { map_v4, map_v6 })
280    }
281
282    /// Return the database in a raw format suitable for embedding.
283    ///
284    /// This method and the format it returns are unstable.
285    /// This method should only be used for maintaining the database.
286    #[cfg(feature = "export")]
287    #[allow(clippy::type_complexity)]
288    pub fn export_raw(&self) -> RawGeoipDbExport {
289        let (ipv4_starts, ipv4_ccs, ipv4_asns) = self.map_v4.export();
290        let (ipv6_starts, ipv6_ccs, ipv6_asns) = self.map_v6.export();
291
292        RawGeoipDbExport {
293            ipv4_starts,
294            ipv4_ccs,
295            ipv4_asns,
296            ipv6_starts,
297            ipv6_ccs,
298            ipv6_asns,
299        }
300    }
301
302    /// Get a 2-letter country code for the given IP address, if this data is available.
303    pub fn lookup_country_code(&self, ip: IpAddr) -> Option<&CountryCode> {
304        match ip {
305            IpAddr::V4(v4) => self.map_v4.get1(&v4.into()),
306            IpAddr::V6(v6) => self.map_v6.get1(&v6.into()),
307        }
308    }
309
310    /// Determine a 2-letter country code for a host with multiple IP addresses.
311    ///
312    /// This looks up all of the IP addresses with `lookup_country_code`. If the lookups
313    /// return different countries, `None` is returned. IP addresses that fail to resolve
314    /// into a country are ignored if some of the other addresses do resolve successfully.
315    pub fn lookup_country_code_multi<I>(&self, ips: I) -> Option<&CountryCode>
316    where
317        I: IntoIterator<Item = IpAddr>,
318    {
319        let mut ret = None;
320
321        for ip in ips {
322            if let Some(cc) = self.lookup_country_code(ip) {
323                // If we already have a return value and it's different, then return None;
324                // a server can't be in two different countries.
325                if ret.is_some() && ret != Some(cc) {
326                    return None;
327                }
328
329                ret = Some(cc);
330            }
331        }
332
333        ret
334    }
335
336    /// Return the ASN the IP address is in, if this data is available.
337    pub fn lookup_asn(&self, ip: IpAddr) -> Option<u32> {
338        let cc = match ip {
339            IpAddr::V4(v4) => self.map_v4.get2(&v4.into()),
340            IpAddr::V6(v6) => self.map_v6.get2(&v6.into()),
341        };
342        cc.map(|nz| nz.get())
343    }
344}
345
/// An address type that can appear as a range endpoint in one of our databases.
trait DbAddress: FromStr {
    /// The integer type used to represent addresses of this kind.
    type Int;

    /// Return the integer form of this address.
    fn to_int(&self) -> Self::Int;
}

// Addresses parsed as `u32` are already integers, so they are returned as-is.
impl DbAddress for u32 {
    type Int = u32;

    fn to_int(&self) -> u32 {
        *self
    }
}

// IPv6 addresses are parsed in their textual form and converted to a `u128`.
impl DbAddress for Ipv6Addr {
    type Int = u128;

    fn to_int(&self) -> u128 {
        u128::from(*self)
    }
}
370
371/// A line as returned by [`parse_line`].
372type ParsedLine<T> = (RangeInclusive<T>, Option<CountryCode>, Option<Asn>);
373
374/// Parse a single line from a database, expecting addresses of type T.
375///
376/// Return Ok(None) if the line is empty.
377fn parse_line<T: DbAddress>(line: &str) -> Result<Option<ParsedLine<T::Int>>, Error>
378where
379    Error: From<<T as FromStr>::Err>,
380{
381    if line.starts_with('#') {
382        return Ok(None);
383    }
384    let line = line.trim();
385    if line.is_empty() {
386        return Ok(None);
387    }
388
389    let mut split = line.split(',');
390    let from = split
391        .next()
392        .ok_or(Error::BadFormat("empty line somehow?".into()))?
393        .parse::<T>()?
394        .to_int();
395    let to = split
396        .next()
397        .ok_or(Error::BadFormat("line with insufficient commas".into()))?
398        .parse::<T>()?
399        .to_int();
400    let cc = split
401        .next()
402        .ok_or(Error::BadFormat("line with insufficient commas".into()))?;
403    let cc = match cc {
404        "" => None,
405        cc => OptionCc::from_str(cc)?.0,
406    };
407    let asn = split.next().map(|x| x.parse::<u32>()).transpose()?;
408    // Treat "0" as "no asn".
409    let asn = asn.map(NonZeroU32::try_from).transpose().ok().flatten();
410
411    Ok(Some((from..=to, cc, asn)))
412}
413
/// A (representation of a) host on the network which may have a known country code.
pub trait HasCountryCode {
    /// Return the country code in which this server is most likely located.
    ///
    /// This is usually implemented by simple GeoIP lookup on the addresses provided by `HasAddrs`.
    /// It follows that the server might not actually be in the returned country, but this is a
    /// halfway decent estimate for what other servers might guess the server's location to be
    /// (and thus useful for e.g. getting around simple geo-blocks, or having webpages return
    /// the correct localised versions).
    ///
    /// Returning `None` signifies that no country code information is available. (Conflicting
    /// GeoIP lookup results might also cause `None` to be returned.)
    ///
    /// The code is returned by value; [`CountryCode`] is `Copy`.
    fn country_code(&self) -> Option<CountryCode>;
}
428
/// An export of a GeoIp database in a raw format suitable for embedding.
///
/// This format is deliberately undocumented, and not for other uses.
///
/// Values of this type are produced by `GeoipDb::export_raw`, and borrow their
/// tables for the lifetime `'a`.
#[cfg(feature = "export")]
// The fields are all `pub` and are left without doc comments on purpose:
// see the note about the format above.
#[allow(clippy::exhaustive_structs, missing_docs)]
pub struct RawGeoipDbExport<'a> {
    pub ipv4_starts: &'a [u32],
    pub ipv4_ccs: &'a [Option<CountryCode>],
    pub ipv4_asns: Option<&'a [Option<NonZeroU32>]>,
    pub ipv6_starts: &'a [u128],
    pub ipv6_ccs: &'a [Option<CountryCode>],
    pub ipv6_asns: Option<&'a [Option<NonZeroU32>]>,
}
442
#[cfg(feature = "export")]
impl<'a> RawGeoipDbExport<'a> {
    /// Save the contents of this export into a set of data files inside the directory `path`.
    ///
    /// Each table is written, as its raw in-memory bytes, to its own file:
    /// `geoip_data_v4s`, `geoip_data_v4c`, `geoip_data_v6s` and `geoip_data_v6c`
    /// are always written; `geoip_data_v4a` and `geoip_data_v6a` are written only
    /// if the corresponding ASN table is present.
    ///
    /// Stops at the first I/O error and returns it.
    pub fn save(&self, path: &std::path::Path) -> std::io::Result<()> {
        /// View a slice of values as the raw bytes that make it up.
        fn into_bytes<'a, T>(data: &'a [T]) -> &'a [u8] {
            // SAFETY: Every possible bit sequence is a valid u8.
            //
            // (The element types used below are `u32`, `u128`,
            // `Option<CountryCode>` and `Option<NonZeroU32>`.)
            let (head, bytes, tail) = unsafe { data.align_to::<u8>() };
            assert!(head.is_empty());
            assert!(tail.is_empty());
            bytes
        }

        // Write one table to the file called `name` under `path`.
        let emit = |name: &str, bytes: &[u8]| std::fs::write(path.join(name), bytes);

        emit("geoip_data_v4s", into_bytes(self.ipv4_starts))?;
        emit("geoip_data_v4c", into_bytes(self.ipv4_ccs))?;
        if let Some(v4_asns) = self.ipv4_asns {
            emit("geoip_data_v4a", into_bytes(v4_asns))?;
        }

        emit("geoip_data_v6s", into_bytes(self.ipv6_starts))?;
        emit("geoip_data_v6c", into_bytes(self.ipv6_ccs))?;
        if let Some(v6_asns) = self.ipv6_asns {
            emit("geoip_data_v6a", into_bytes(v6_asns))?;
        }

        Ok(())
    }
}
468
#[cfg(test)]
mod test {
    // @@ begin test lint list maintained by maint/add_warning @@
    #![allow(clippy::bool_assert_comparison)]
    #![allow(clippy::clone_on_copy)]
    #![allow(clippy::dbg_macro)]
    #![allow(clippy::mixed_attributes_style)]
    #![allow(clippy::print_stderr)]
    #![allow(clippy::print_stdout)]
    #![allow(clippy::single_char_pattern)]
    #![allow(clippy::unwrap_used)]
    #![allow(clippy::unchecked_time_subtraction)]
    #![allow(clippy::useless_vec)]
    #![allow(clippy::needless_pass_by_value)]
    #![allow(clippy::string_slice)] // See arti#2571
    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->

    use super::*;
    use std::net::Ipv4Addr;

    // Check that the embedded database loads, and answers one v4 and one v6 lookup.
    //
    // NOTE(eta): this test takes a whole 1.6 seconds in *non-release* mode
    #[test]
    #[cfg(feature = "embedded-db")]
    fn embedded_db() {
        let db = GeoipDb::new_embedded();

        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(8, 8, 8, 8).into())
                .map(|x| x.as_ref()),
            Some("US")
        );

        assert_eq!(
            db.lookup_country_code("2001:4860:4860::8888".parse().unwrap())
                .map(|x| x.as_ref()),
            Some("US")
        );
    }

    // A country code converts back to the string it was made from.
    #[test]
    fn cc_rep() {
        let italy = CountryCode::new("IT").unwrap();
        assert_eq!(italy.as_ref(), "IT");
    }

    // Build a tiny database from legacy-format text, and look addresses up in it.
    #[test]
    fn basic_lookups() {
        // 16909056 is 1.2.3.0 and 16909311 is 1.2.3.255, as integers.
        let src_v4 = r#"
        16909056,16909311,GB
        "#;
        let src_v6 = r#"
        dead:beef::,dead:ffff::,??
        fe80::,fe81::,US
        "#;
        let db = GeoipDb::new_from_legacy_format(src_v4, src_v6, true).unwrap();

        // Inside the v4 range.
        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(1, 2, 3, 4).into())
                .map(|x| x.as_ref()),
            Some("GB")
        );

        // Outside every v4 range.
        assert_eq!(
            db.lookup_country_code(Ipv4Addr::new(1, 1, 1, 1).into()),
            None
        );

        // Inside the "US" v6 range.
        assert_eq!(
            db.lookup_country_code("fe80::dead:beef".parse().unwrap())
                .map(|x| x.as_ref()),
            Some("US")
        );

        // Just past the end of the "US" v6 range.
        assert_eq!(
            db.lookup_country_code("fe81::dead:beef".parse().unwrap()),
            None
        );
        // Inside the range whose country is `??`: no country is reported.
        assert_eq!(
            db.lookup_country_code("dead:beef::1".parse().unwrap()),
            None
        );
    }

    // Exercise the accepting and the rejecting paths of `CountryCode` parsing.
    #[test]
    fn cc_parse() -> Result<(), Error> {
        // real countries.
        assert_eq!(CountryCode::from_str("us")?, CountryCode::from_str("US")?);
        assert_eq!(CountryCode::from_str("UY")?, CountryCode::from_str("UY")?);

        // not real as of this writing, but still representable.
        assert_eq!(CountryCode::from_str("A7")?, CountryCode::from_str("a7")?);
        assert_eq!(CountryCode::from_str("xz")?, CountryCode::from_str("xz")?);

        // Can't convert to two bytes.
        assert!(matches!(
            CountryCode::from_str("z"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("🐻‍❄️"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("Sheboygan"),
            Err(Error::BadCountryCode(_))
        ));

        // Can convert to two bytes, but still not printable ascii
        assert!(matches!(
            CountryCode::from_str("\r\n"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("\0\0"),
            Err(Error::BadCountryCode(_))
        ));
        assert!(matches!(
            CountryCode::from_str("¡"),
            Err(Error::BadCountryCode(_))
        ));

        // Not a country.
        assert!(matches!(
            CountryCode::from_str("??"),
            Err(Error::NowhereNotSupported)
        ));

        Ok(())
    }

    // `OptionCc` parses real codes like `CountryCode` does, and `??` as `None`.
    #[test]
    fn opt_cc_parse() -> Result<(), Error> {
        assert_eq!(
            CountryCode::from_str("br")?,
            OptionCc::from_str("BR")?.0.unwrap()
        );
        assert!(OptionCc::from_str("??")?.0.is_none());

        Ok(())
    }
}