from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import struct
import bisect
import socket
import binascii
import sys
import time
# Byte sequence that introduces the metadata section; parse_mm_file() looks
# for its last occurrence in the file.
METADATA_MARKER = b'\xab\xcd\xef' + b'MaxMind.com'
# Indexing a bytestring yields an int on some Python versions and a
# length-1 string on others; choose the matching converter once, at import.
if isinstance(b"1"[0], int):
    def byte_to_int(b):
        """Convert a single element of a bytestring to an integer."""
        # Elements are already integers here, so there is nothing to do.
        return b
else:
    # Elements are length-1 strings here, which is exactly what ord() takes.
    byte_to_int = ord
# Where str is the bytes type, str() itself already turns a bytestring into
# a native string; otherwise the bytes must be decoded explicitly.
if str is bytes:
    bytesToStr = str
else:
    def bytesToStr(b):
        """Convert a UTF-8 encoded bytestring to a native string."""
        return str(b, 'utf8')
def to_int(s):
    """Parse a big-endian unsigned integer of any length from bytestring s.

    An empty bytestring yields 0.
    """
    value = 0
    for octet in s:
        # Make room for the next (less significant) byte, then add it in.
        value = (value << 8) | byte_to_int(octet)
    return value
def to_int24(s):
    """Parse a pair of big-endian 24-bit integers from the 6-byte string s.

    Returns a (first, second) tuple.  struct.error is raised if s is not
    exactly 6 bytes long.
    """
    # Bytes 0-2 form the first value and bytes 3-5 the second, so unpack
    # each value as a 16-bit part plus an 8-bit part.
    first_hi16, first_lo8, second_hi8, second_lo16 = struct.unpack("!HBBH", s)
    first = (first_hi16 << 8) | first_lo8
    second = (second_hi8 << 16) | second_lo16
    return first, second
def to_int32(s):
    """Parse a pair of big-endian 32-bit integers from the 8-byte string s.

    Returns a (first, second) tuple.  struct.error is raised if s is not
    exactly 8 bytes long.
    """
    first, second = struct.unpack("!2L", s)
    return (first, second)
def to_int28(s):
    """Parse a pair of big-endian 28-bit integers from the 7-byte string s.

    Bytes 0-2 hold the low 24 bits of the first value and bytes 4-6 the low
    24 bits of the second.  The middle byte carries the top four bits of
    both: its high nibble belongs to the first value and its low nibble to
    the second.

    Returns a (first, second) tuple.  struct.error is raised if s is not
    exactly 7 bytes long.
    """
    first_hi16, first_lo8, shared, second_hi8, second_lo16 = \
        struct.unpack("!HBBBH", s)
    first = ((shared >> 4) << 24) | (first_hi16 << 8) | first_lo8
    second = ((shared & 0x0f) << 24) | (second_hi8 << 16) | second_lo16
    return first, second
class Tree(object):
    """A single node of the search tree.

    `left` and `right` are the node's two records exactly as they were
    decoded from the file.  resolve_tree() later attaches the objects they
    refer to as `left_item` and `right_item`.
    """
    def __init__(self, left, right):
        self.left, self.right = left, right
def resolve_tree(tree, data):
    """Attach to every node of `tree` the objects its records refer to.

    For each node, sets `left_item` and `right_item` to one of:
      * another node of `tree`, when the record is smaller than len(tree);
      * None, when the record equals len(tree);
      * the element of `data` whose `pos` equals `record - len(tree) - 16`
        otherwise.

    Arguments:
      tree: list of nodes with integer `left` and `right` attributes.
      data: iterable of items with a `pos` attribute (their byte offset
        within the data section).

    Raises:
      ValueError: if a record refers to an offset at which no item of
        `data` starts.
    """
    node_count = len(tree)
    # An exact offset index.  A miss is reported explicitly rather than via
    # assert, which would be stripped under -O.
    by_pos = dict((d.pos, d) for d in data)

    def resolve_item(item):
        "Helper: resolve a single record value."
        if item < node_count:
            return tree[item]
        if item == node_count:
            return None
        # The 16 matches the 16-byte separator that parse_mm_file() skips
        # between the search tree and the data section.
        offset = item - node_count - 16
        try:
            return by_pos[offset]
        except KeyError:
            raise ValueError("Search tree record %d refers to data offset "
                             "%d, where no datum starts" % (item, offset))

    for t in tree:
        t.left_item = resolve_item(t.left)
        t.right_item = resolve_item(t.right)
def parse_search_tree(s, record_size):
    """Decode the search-tree bytestring s into a list of Tree nodes.

    Arguments:
      s: the raw bytes of the search tree.
      record_size: size in bits of one record; every node holds two.

    Returns:
      A list of Tree objects in file order.

    Raises:
      NotImplementedError: if record_size is not 24, 28 or 32.
    """
    node_bytes = (record_size * 2) // 8

    decoders = {24: to_int24, 28: to_int28, 32: to_int32}
    if record_size not in decoders:
        raise NotImplementedError("Unsupported record size in bits: %d" %
                                  record_size)
    split_node = decoders[record_size]

    nodes = []
    for start in range(0, len(s), node_bytes):
        left, right = split_node(s[start:start + node_bytes])
        nodes.append(Tree(left, right))
    return nodes
class Datum(object):
    """One item decoded from a data or metadata section.

    Attributes:
      pos: byte offset of the item within its section.
      kind: one of the TP_* type codes.
      ln: the size field from the item's header.  parse_data_section()
        uses it as the payload length for plain values and as the entry
        count for arrays and maps; resolve_pointers() uses it as the target
        offset for pointers.
      data: the raw payload bytes.
      children: None, or the list of directly nested items (set by
        parse_data_section() for arrays and maps).

    Ordering and equality look only at `pos`, so a Datum can be used as a
    bisect search key into a list sorted by position.
    """
    def __init__(self, pos, kind, ln, data):
        self.pos = pos
        self.kind = kind
        self.ln = ln
        self.data = data
        self.children = None

    def __repr__(self):
        return "Datum(%r,%r,%r,%r)" % (self.pos, self.kind, self.ln, self.data)

    def __lt__(self, other):
        return self.pos < other.pos

    def __gt__(self, other):
        return self.pos > other.pos

    def __eq__(self, other):
        return self.pos == other.pos

    def build_maps(self):
        """Recursively build the `map` dictionary of every nested TP_MAP.

        Items without an `nChildren` attribute (non-containers, or
        containers that were already processed) are left untouched.

        Raises:
          ValueError: if a map key is not a UTF-8 string, or a pointer
            chain followed while resolving an entry is too long.
        """
        if not hasattr(self, 'nChildren'):
            return

        if self.kind == TP_ARRAY:
            # Drop the marker before recursing so that reaching this item
            # again returns immediately.
            del self.nChildren
            for c in self.children:
                c.build_maps()
        elif self.kind == TP_MAP:
            del self.nChildren
            self.map = {}
            # Children alternate key, value, key, value, ...
            for i in range(0, len(self.children), 2):
                k = self.children[i].deref()
                v = self.children[i+1].deref()
                v.build_maps()
                if k.kind != TP_UTF8:
                    raise ValueError("Bad dictionary key type %d"% k.kind)
                self.map[bytesToStr(k.data)] = v

    def int_val(self):
        """Return the integer value of an integer-typed item.

        Raises:
          ValueError: if this item is not one of the integer types.
        """
        if self.kind not in (TP_UINT16, TP_UINT32, TP_UINT64,
                             TP_UINT128, TP_SINT32):
            # Explicit check: an assert would disappear under -O.
            raise ValueError("Datum of type %r does not hold an integer" %
                             (self.kind,))
        i = to_int(self.data)
        if self.kind == TP_SINT32:
            # Reinterpret the unsigned value as 32-bit two's complement.
            if i & 0x80000000:
                i = i - 0x100000000
        return i

    def deref(self):
        """Follow pointers until reaching a non-pointer item; return it.

        Raises:
          ValueError: if 100 pointers in a row were followed, which
            indicates a pointer loop.
        """
        hops = 0
        s = self
        while s.kind == TP_PTR:
            s = s.ptr
            hops += 1
            # Explicit check: an assert would disappear under -O and a
            # cyclic pointer chain would then loop forever.
            if hops >= 100:
                raise ValueError("Pointer chain too long; possible "
                                 "pointer loop")
        return s
def resolve_pointers(data):
    """Set the `ptr` attribute of every pointer item in `data`.

    A pointer's `ln` field holds the offset of its target; `ptr` is set to
    the item of `data` whose `pos` equals that offset.

    Arguments:
      data: list of Datum items from a single section.

    Raises:
      ValueError: if a pointer targets an offset at which no item starts.
    """
    # An exact offset index.  A miss is reported explicitly rather than via
    # assert, which would be stripped under -O.
    by_pos = dict((d.pos, d) for d in data)
    for d in data:
        if d.kind != TP_PTR:
            continue
        try:
            d.ptr = by_pos[d.ln]
        except KeyError:
            raise ValueError("Pointer at offset %d targets offset %d, where "
                             "no datum starts" % (d.pos, d.ln))
# Type codes of data-section items, numbered consecutively from 1:
#   1 TP_PTR      2 TP_UTF8     3 TP_DBL      4 TP_BYTES    5 TP_UINT16
#   6 TP_UINT32   7 TP_MAP      8 TP_SINT32   9 TP_UINT64  10 TP_UINT128
#  11 TP_ARRAY   12 TP_DCACHE  13 TP_END     14 TP_BOOL    15 TP_FLOAT
(TP_PTR, TP_UTF8, TP_DBL, TP_BYTES, TP_UINT16,
 TP_UINT32, TP_MAP, TP_SINT32, TP_UINT64, TP_UINT128,
 TP_ARRAY, TP_DCACHE, TP_END, TP_BOOL, TP_FLOAT) = range(1, 16)
def get_type_and_len(s):
    """Decode the header of the data item at the start of bytestring s.

    Returns a tuple (tp, ln, skip):
      tp: the item's TP_* type code;
      ln: the decoded size field (for a pointer, the decoded pointer
        value);
      skip: the number of header bytes consumed.
    """
    control = byte_to_int(s[0])
    kind = control >> 5
    header_len = 1
    if kind == 0:
        # Extended type: the real type is stored in the next byte, less 7.
        kind = byte_to_int(s[1]) + 7
        header_len = 2

    size = control & 31
    if kind == TP_PTR:
        # Bits 3-4 of the size field select how many extra bytes follow.
        extra = (size >> 3) + 1
        if extra == 4:
            # Four extra bytes hold the whole value; the low bits of the
            # control byte are not used.
            base = 0
        else:
            # The low three bits are the most significant bits of the value.
            base = (size & 7) << (extra * 8)
        size = base + to_int(s[header_len:header_len + extra])
        # Fixed bias added for each pointer width.
        size += (0, 0, 2048, 526336, 0)[extra]
        header_len += extra
    elif size >= 29:
        # 29, 30 and 31 mean the size continues in 1, 2 or 3 extra bytes.
        extra = size - 28
        size = to_int(s[header_len:header_len + extra])
        # Fixed bias added for each extended-size width.
        size += (0, 29, 285, 65821)[extra]
        header_len += extra

    return kind, size, header_len
# Types whose size field is not a payload byte count: parse_data_section()
# reads no payload bytes for these.
IGNORE_LEN_TYPES = set((
    TP_MAP,
    TP_ARRAY,
    TP_PTR,
    TP_BOOL,
    TP_DCACHE,
))
def parse_data_section(s):
    """Decode a data section held in bytestring s into a list of Datum items.

    The returned list contains every item in file order, including items
    nested inside arrays and maps.  Each array or map additionally gets a
    `children` list holding its directly nested items, and an `nChildren`
    counter that Datum.build_maps() later uses as a "not yet processed"
    marker.

    Arguments:
      s: the raw bytes of the section.

    Returns:
      A list of Datum objects whose `pos` is the byte offset within s.
    """
    # Containers that are still waiting for children, innermost last.
    stack = []
    # Every item, nested or not, in file order.
    data = []
    # Byte offset of the next item.  Indexing into s by offset avoids
    # re-copying the rest of the section after every item.
    pos = 0
    end = len(s)
    # A header is at most 5 bytes long (control byte, optional type byte,
    # up to 4 size bytes); 8 leaves some slack.
    header_window = 8

    while pos < end:
        tp, ln, skip = get_type_and_len(s[pos:pos + header_window])
        if tp in IGNORE_LEN_TYPES:
            real_len = 0
        else:
            real_len = ln

        payload_start = pos + skip
        d = Datum(pos, tp, ln, s[payload_start:payload_start + real_len])
        data.append(d)
        pos = payload_start + real_len

        if stack:
            parent = stack[-1]
            parent.children.append(d)
            parent.nChildren -= 1
            if parent.nChildren == 0:
                del stack[-1]

        if d.kind == TP_ARRAY:
            expected = d.ln
        elif d.kind == TP_MAP:
            # A map entry is a key followed by a value.
            expected = d.ln * 2
        else:
            continue

        d.nChildren = expected
        d.children = []
        # An empty container expects nothing.  Pushing it would leave it on
        # the stack forever (its counter would skip past zero) and make it
        # swallow every later item as a child.
        if expected:
            stack.append(d)

    return data
def parse_mm_file(s):
    """Parse the complete contents of a MaxMind DB file held in bytestring s.

    Returns:
      A tuple (metadata, tree, data): the list of Datum items of the
      metadata section, the list of Tree nodes of the search tree, and the
      list of Datum items of the data section.  Pointers and tree records
      are resolved and maps are built before returning.

    Raises:
      ValueError: if the metadata marker is missing, the metadata does not
        start with a map, or the separator after the search tree is
        missing.
    """
    try:
        metadata_ptr = s.rindex(METADATA_MARKER)
    except ValueError:
        raise ValueError("No metadata!")

    metadata_start = metadata_ptr + len(METADATA_MARKER)
    metadata = parse_data_section(s[metadata_start:])
    root = metadata[0]
    if root.kind != TP_MAP:
        raise ValueError("Bad map")
    root.build_maps()
    mm = root.map

    record_size = mm['record_size'].int_val()
    node_count = mm['node_count'].int_val()
    # Each node holds two records of record_size bits.
    tree_size = ((record_size * 2) // 8) * node_count

    # Sixteen zero bytes separate the search tree from the data section.
    data_start = tree_size + 16
    if s[tree_size:data_start] != b'\x00'*16:
        raise ValueError("Missing section separator!")

    tree = parse_search_tree(s[:tree_size], record_size)
    data = parse_data_section(s[data_start:metadata_ptr])

    resolve_pointers(data)
    resolve_tree(tree, data)
    for d in data:
        d.build_maps()

    return metadata, tree, data
def format_datum(datum):
try:
return bytesToStr(datum.map['country'].map['iso_code'].data)
except KeyError:
pass
try:
return bytesToStr(datum.map['registered_country'].map['iso_code'].data)
except KeyError:
pass
return None
# Bit-string prefix (96 zero bits) under which IPv4 addresses sit in the
# 128-bit search tree.
IPV4_PREFIX = "0"*96

def dump_item_ipv4(entries, prefix, val):
    """Append the IPv4 range covered by `prefix` to `entries`.

    Arguments:
      entries: list to which a (low, high, val) tuple is appended; low and
        high are the first and last address of the range, as integers.
      prefix: the path from the tree root, as a string of '0' and '1'.
      val: the value to record for the range.

    Prefixes that do not start with IPV4_PREFIX are ignored.
    """
    if not prefix.startswith(IPV4_PREFIX):
        return
    prefix = prefix[len(IPV4_PREFIX):]
    # A prefix of exactly 96 zero bits leaves nothing behind: it covers the
    # whole IPv4 space, and int("", 2) would raise ValueError.
    v = int(prefix, 2) if prefix else 0
    shift = 32 - len(prefix)
    lo = v << shift
    hi = ((v+1) << shift) - 1
    entries.append((lo, hi, val))
def fmt_item_ipv4(entry):
    """Format a (low, high, value) entry as one "low,high,value" line."""
    low, high, value = entry[0], entry[1], entry[2]
    return "%d,%d,%s\n" % (low, high, value)
def fmt_ipv6_addr(v):
    """Return the textual IPv6 address for the 128-bit integer v."""
    # 32 hex digits, zero-padded, give exactly the 16 bytes of the address.
    hex_digits = "%032x" % v
    packed = binascii.unhexlify(hex_digits)
    return socket.inet_ntop(socket.AF_INET6, packed)
def fmt_item_ipv6(entry):
    """Format a (low, high, value) entry as one "low,high,value" line.

    Both range ends are written as textual IPv6 addresses.
    """
    first = fmt_ipv6_addr(entry[0])
    last = fmt_ipv6_addr(entry[1])
    return "%s,%s,%s\n" % (first, last, entry[2])
# Bit-string prefixes that dump_item_ipv6() leaves out of its output.
# 80 zero bits followed by 16 one bits, i.e. ::ffff:0:0/96.
IPV4_MAPPED_IPV6_PREFIX = "0"*80 + "1"*16
# The 16 bits of 0x2002, i.e. 2002::/16.
IPV6_6TO4_PREFIX = "0010000000000010"
# The 16 bits of 0x2001 followed by 16 zero bits, i.e. 2001:0::/32.
TEREDO_IPV6_PREFIX = "0010000000000001" + "0"*16

def dump_item_ipv6(entries, prefix, val):
    """Append the IPv6 range covered by `prefix` to `entries`.

    Arguments:
      entries: list to which a (low, high, val) tuple is appended; low and
        high are the first and last address of the range, as integers.
      prefix: the path from the tree root, as a string of '0' and '1'.
      val: the value to record for the range.

    Prefixes starting with IPV4_PREFIX, IPV4_MAPPED_IPV6_PREFIX,
    IPV6_6TO4_PREFIX or TEREDO_IPV6_PREFIX are ignored.
    """
    excluded = (IPV4_PREFIX, IPV4_MAPPED_IPV6_PREFIX,
                IPV6_6TO4_PREFIX, TEREDO_IPV6_PREFIX)
    if prefix.startswith(excluded):
        return
    host_bits = 128 - len(prefix)
    first = int(prefix, 2) << host_bits
    # The last address has every host bit set.
    last = first + (1 << host_bits) - 1
    entries.append((first, last, val))
def dump_tree(entries, node, dump_item, prefix=""):
    """Walk the resolved search tree and report every leaf with a country.

    Arguments:
      entries: list passed through to dump_item, which appends to it.
      node: the current position in the tree: a Tree, a Datum, or None
        for an empty record.
      dump_item: callable invoked as dump_item(entries, prefix, code) for
        each map datum for which format_datum() returns a code.
      prefix: the path from the root to `node`, as a string of '0' (left)
        and '1' (right).

    Raises:
      ValueError: if a datum reached from the tree is not a map.
      TypeError: if `node` is neither a Tree, a Datum, nor None.
    """
    if node is None:
        return

    if isinstance(node, Tree):
        dump_tree(entries, node.left_item, dump_item, prefix+"0")
        dump_tree(entries, node.right_item, dump_item, prefix+"1")
    elif isinstance(node, Datum):
        # Explicit checks: an assert would disappear under -O.
        if node.kind != TP_MAP:
            raise ValueError("Search tree leaf at prefix %r has type %r, "
                             "expected a map" % (prefix, node.kind))
        code = format_datum(node)
        if code:
            dump_item(entries, prefix, code)
    else:
        raise TypeError("Unexpected object in search tree: %r" % (node,))
# Comment header written at the top of each output file; the single %s
# receives the build date of the source database.
GEOIP_FILE_HEADER = (
    "# Last updated based on %s Maxmind GeoLite2 Country\n"
    "# wget https://geolite.maxmind.com/download/geoip/database/GeoLite2-Country.mmdb.gz\n"
    "# gunzip GeoLite2-Country.mmdb.gz\n"
    "# python mmdb-convert.py GeoLite2-Country.mmdb\n"
)
def write_geoip_file(filename, metadata, the_tree, dump_item, fmt_item):
    """Write the ranges found in the search tree to the file `filename`.

    Arguments:
      filename: path of the output file; it is overwritten.
      metadata: list of metadata Datum items; the first must be a map with
        a 'build_epoch' entry.
      the_tree: list of resolved Tree nodes; the first one is the root.
      dump_item: callable that collects (low, high, value) entries, see
        dump_tree().
      fmt_item: callable turning one entry into an output line.

    Consecutive entries that touch and carry the same value are merged
    into a single line.
    """
    entries = []
    dump_tree(entries, the_tree[0], dump_item)

    build_epoch = metadata[0].map['build_epoch'].int_val()
    build_time = time.gmtime(build_epoch)
    # Format the day and year by hand: strftime's '%-d' (day without
    # padding) is a platform extension that is not available everywhere.
    build_date = "%s %d %d" % (time.strftime('%B', build_time),
                               build_time.tm_mday, build_time.tm_year)

    # The with-block closes the file even if formatting an entry fails.
    with open(filename, 'w') as fobj:
        fobj.write(GEOIP_FILE_HEADER % build_date)

        # The range currently being extended, not yet written out.
        unwritten = None
        for entry in entries:
            if unwritten is None:
                unwritten = entry
            elif unwritten[1] + 1 == entry[0] and unwritten[2] == entry[2]:
                # Adjacent range with the same value: extend the pending one.
                unwritten = (unwritten[0], entry[1], unwritten[2])
            else:
                fobj.write(fmt_item(unwritten))
                unwritten = entry
        if unwritten is not None:
            fobj.write(fmt_item(unwritten))
# Script entry point: convert the database named on the command line into
# the 'geoip' (IPv4) and 'geoip6' (IPv6) files in the current directory.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit("Usage: %s GeoLite2-Country.mmdb" % sys.argv[0])

    # Read the whole database, closing the file handle promptly.
    with open(sys.argv[1], 'rb') as mmdb_file:
        content = mmdb_file.read()

    metadata, the_tree, _ = parse_mm_file(content)
    write_geoip_file('geoip', metadata, the_tree,
                     dump_item_ipv4, fmt_item_ipv4)
    write_geoip_file('geoip6', metadata, the_tree,
                     dump_item_ipv6, fmt_item_ipv6)