use std::fs::File;
use std::io::Cursor;
use std::io::{BufRead, BufReader, Read};
use std::path::{Path, PathBuf};
use byteorder::{LittleEndian, ReadBytesExt};
use md5::{digest::generic_array::GenericArray, Digest, Md5};
use memmap::Mmap;
use crate::cluster::Cluster;
use crate::directory_entry::DirectoryEntry;
use crate::directory_iterator::DirectoryIterator;
use crate::errors::{Error, Result};
use crate::mime_type::MimeType;
use crate::uuid::Uuid;
/// Magic number expected in the first four bytes of a ZIM file, read as a
/// little-endian `u32` by `parse_header`.
///
/// `72173914` is `0x044D495A`; stored little-endian that is the byte sequence
/// `5A 49 4D 04`, i.e. ASCII `"ZIM"` followed by `0x04`.
pub const ZIM_MAGIC_NUMBER: u32 = 72173914;
/// An opened ZIM archive.
///
/// All fields are populated once by [`Zim::new`]: the file is memory-mapped
/// and the header, MIME table, pointer lists and stored checksum are parsed
/// eagerly from that mapping.
#[allow(dead_code)]
pub struct Zim {
/// Parsed fixed-size header of the file.
pub header: ZimHeader,
/// Memory map over the whole file; every other structure is read from it.
pub master_view: Mmap,
/// Path the archive was opened from (re-opened by `verify_checksum`).
pub file_path: PathBuf,
/// MIME type strings in file order; indexed by the id passed to `get_mimetype`.
pub mime_table: Vec<String>,
/// `article_count` little-endian `u64` values read at `header.url_ptr_pos`.
/// `get_by_url_index` uses each value as a byte offset into `master_view`.
pub url_list: Vec<u64>,
/// `article_count` little-endian `u32` values read at `header.title_ptr_pos`.
/// NOTE(review): presumably indices into `url_list` ordered by title — confirm
/// against the ZIM format specification.
pub article_list: Vec<u32>,
/// `cluster_count` little-endian `u64` values read at `header.cluster_ptr_pos`;
/// handed to `Cluster::new` by `get_cluster`.
pub cluster_list: Vec<u64>,
/// The 16 bytes stored in the file at `header.checksum_pos`.
pub checksum: Checksum,
}
/// Byte array sized to the output of the MD5 digest; used both for the
/// checksum stored in the file and for the one recomputed from its contents.
pub type Checksum = GenericArray<u8, <Md5 as Digest>::OutputSize>;
/// Fixed-size header found at the start of a ZIM file.
///
/// Fields are listed in the order `parse_header` reads them (after the
/// 4-byte magic number); all integers are little-endian in the file.
pub struct ZimHeader {
/// Major format version; `parse_header` accepts only 5 and 6.
pub version_major: u16,
/// Minor format version.
pub version_minor: u16,
/// 16-byte identifier of the archive.
pub uuid: Uuid,
/// Number of entries in the URL and title pointer lists.
pub article_count: u32,
/// Number of entries in the cluster pointer list.
pub cluster_count: u32,
/// Byte offset of the URL pointer list.
pub url_ptr_pos: u64,
/// Byte offset of the title pointer list.
pub title_ptr_pos: u64,
/// Byte offset of the cluster pointer list.
pub cluster_ptr_pos: u64,
/// Byte offset of the MIME type list.
pub mime_list_pos: u64,
/// Main page value; `None` when the stored value is `0xffffffff`.
pub main_page: Option<u32>,
/// Layout page value; `None` when the stored value is `0xffffffff`.
pub layout_page: Option<u32>,
/// Byte offset of the stored checksum; bytes before it are what gets hashed.
pub checksum_pos: u64,
/// Extra `u64` read at offset 80, present only when `mime_list_pos > 80`.
pub geo_index_pos: Option<u64>,
}
impl Zim {
pub fn new<P: AsRef<Path>>(p: P) -> Result<Zim> {
let f = File::open(p.as_ref())?;
let master_view = unsafe { Mmap::map(&f)? };
let (header, mime_table) = parse_header(&master_view)?;
let url_list = parse_url_list(&master_view, header.url_ptr_pos, header.article_count)?;
let article_list =
parse_article_list(&master_view, header.title_ptr_pos, header.article_count)?;
let cluster_list =
parse_cluster_list(&master_view, header.cluster_ptr_pos, header.cluster_count)?;
let checksum = read_checksum(&master_view, header.checksum_pos)?;
Ok(Zim {
header,
file_path: p.as_ref().into(),
master_view,
mime_table,
url_list,
article_list,
cluster_list,
checksum,
})
}
pub fn article_count(&self) -> usize {
self.article_list.len()
}
pub fn verify_checksum(&self) -> Result<()> {
let checksum_computed = compute_checksum(&self.file_path, self.header.checksum_pos)?;
if self.checksum != checksum_computed {
return Err(Error::InvalidChecksum);
}
Ok(())
}
pub fn get_mimetype(&self, id: u16) -> Option<MimeType> {
match id {
0xffff => Some(MimeType::Redirect),
0xfffe => Some(MimeType::LinkTarget),
0xfffd => Some(MimeType::DeletedEntry),
id => {
if (id as usize) < self.mime_table.len() {
Some(MimeType::Type(self.mime_table[id as usize].clone()))
} else {
println!("WARNING unknown mimetype idx {}", id);
None
}
}
}
}
pub fn iterate_by_urls(&self) -> DirectoryIterator {
DirectoryIterator::new(self)
}
pub fn get_by_url_index(&self, idx: u32) -> Result<DirectoryEntry> {
let entry_offset = self.url_list[idx as usize] as usize;
let (_, dir_view) = self.master_view.split_at(entry_offset);
DirectoryEntry::new(self, dir_view)
}
pub fn get_cluster(&self, idx: u32) -> Result<Cluster> {
Cluster::new(
&self.master_view,
&self.cluster_list,
idx,
self.header.checksum_pos,
self.header.version_major,
)
}
}
/// Maps the on-disk "not set" sentinel `0xffffffff` to `None`; every other
/// value is passed through unchanged inside `Some`.
fn is_defined(val: u32) -> Option<u32> {
    match val {
        0xffff_ffff => None,
        defined => Some(defined),
    }
}
/// Parses the fixed-size ZIM header and the MIME type list from the mapping.
///
/// Returns the decoded header together with the MIME type strings in file
/// order.
///
/// # Errors
///
/// * `Error::InvalidMagicNumber` if the file does not start with
///   `ZIM_MAGIC_NUMBER`.
/// * `Error::InvalidVersion` if the major version is neither 5 nor 6.
/// * `Error::InvalidHeader` if the fixed fields did not consume exactly
///   80 bytes.
/// * An I/O error if the file is too short, or a UTF-8 error if a MIME type
///   string is not valid UTF-8.
fn parse_header(master_view: &Mmap) -> Result<(ZimHeader, Vec<String>)> {
    let mut header_cur = Cursor::new(master_view);

    let magic = header_cur.read_u32::<LittleEndian>()?;
    if magic != ZIM_MAGIC_NUMBER {
        return Err(Error::InvalidMagicNumber);
    }
    let version_major = header_cur.read_u16::<LittleEndian>()?;
    if version_major != 5 && version_major != 6 {
        return Err(Error::InvalidVersion);
    }
    let version_minor = header_cur.read_u16::<LittleEndian>()?;

    let mut uuid = [0u8; 16];
    header_cur.read_exact(&mut uuid)?;

    let article_count = header_cur.read_u32::<LittleEndian>()?;
    let cluster_count = header_cur.read_u32::<LittleEndian>()?;
    let url_ptr_pos = header_cur.read_u64::<LittleEndian>()?;
    let title_ptr_pos = header_cur.read_u64::<LittleEndian>()?;
    let cluster_ptr_pos = header_cur.read_u64::<LittleEndian>()?;
    let mime_list_pos = header_cur.read_u64::<LittleEndian>()?;
    let main_page = header_cur.read_u32::<LittleEndian>()?;
    let layout_page = header_cur.read_u32::<LittleEndian>()?;
    let checksum_pos = header_cur.read_u64::<LittleEndian>()?;

    // Sanity check: the fixed fields above add up to exactly 80 bytes.
    if header_cur.position() != 80 {
        return Err(Error::InvalidHeader);
    }

    // A MIME list that starts after byte 80 leaves room for one more u64.
    let geo_index_pos = if mime_list_pos > 80 {
        Some(header_cur.read_u64::<LittleEndian>()?)
    } else {
        None
    };

    // The header states where the MIME list lives; start reading there rather
    // than assuming it directly follows the fields read so far.
    header_cur.set_position(mime_list_pos);

    let mut mime_table = Vec::new();
    loop {
        let mut mime_buf = Vec::new();
        // Propagate read errors: ignoring them here would spin forever.
        let size = header_cur.read_until(0, &mut mime_buf)?;
        // End of data, or an empty string (lone NUL), terminates the list.
        if size <= 1 {
            break;
        }
        // Strip the NUL terminator, but only if it is really there: the last
        // entry of a truncated file may be unterminated.
        if mime_buf.last() == Some(&0) {
            mime_buf.pop();
        }
        mime_table.push(String::from_utf8(mime_buf)?);
    }

    Ok((
        ZimHeader {
            version_major,
            version_minor,
            uuid: Uuid::new(uuid),
            article_count,
            cluster_count,
            url_ptr_pos,
            title_ptr_pos,
            cluster_ptr_pos,
            mime_list_pos,
            main_page: is_defined(main_page),
            layout_page: is_defined(layout_page),
            checksum_pos,
            geo_index_pos,
        },
        mime_table,
    ))
}
fn parse_url_list(master_view: &Mmap, ptr_pos: u64, count: u32) -> Result<Vec<u64>> {
let start = ptr_pos as usize;
let end = (ptr_pos + count as u64 * 8) as usize;
let list_view = master_view.get(start..end).ok_or(Error::OutOfBounds)?;
let mut cur = Cursor::new(list_view);
let mut out: Vec<u64> = Vec::new();
for _ in 0..count {
out.push(cur.read_u64::<LittleEndian>()?);
}
Ok(out)
}
fn parse_article_list(master_view: &Mmap, ptr_pos: u64, count: u32) -> Result<Vec<u32>> {
let start = ptr_pos as usize;
let end = (ptr_pos as u32 + count * 4) as usize;
let list_view = master_view.get(start..end).ok_or(Error::OutOfBounds)?;
let mut cur = Cursor::new(list_view);
let mut out: Vec<u32> = Vec::new();
for _ in 0..count {
out.push(cur.read_u32::<LittleEndian>()?);
}
Ok(out)
}
fn parse_cluster_list(master_view: &Mmap, ptr_pos: u64, count: u32) -> Result<Vec<u64>> {
let start = ptr_pos as usize;
let end = (ptr_pos as u32 + count * 8) as usize;
let cluster_list_view = master_view.get(start..end).ok_or(Error::OutOfBounds)?;
let mut cluster_cur = Cursor::new(cluster_list_view);
let mut out: Vec<u64> = Vec::new();
for _ in 0..count {
out.push(cluster_cur.read_u64::<LittleEndian>()?);
}
Ok(out)
}
fn read_checksum(master_view: &Mmap, checksum_pos: u64) -> Result<Checksum> {
match master_view.get(checksum_pos as usize..checksum_pos as usize + 16) {
Some(raw) => {
let mut arr = GenericArray::default();
arr.copy_from_slice(raw);
Ok(arr)
}
None => Err(Error::MissingChecksum),
}
}
/// Computes the MD5 digest of the first `checksum_pos` bytes of the file at
/// `path`, streaming it in 1 KiB chunks.
///
/// # Errors
///
/// Propagates any I/O error from opening or reading the file.
fn compute_checksum(path: &Path, checksum_pos: u64) -> Result<Checksum> {
    // Only the bytes preceding the stored checksum are hashed.
    let limited = File::open(path)?.take(checksum_pos);
    let mut reader = BufReader::new(limited);
    let mut hasher = Md5::new();
    let mut chunk = [0u8; 1024];
    loop {
        match reader.read(&mut chunk)? {
            0 => break,
            filled => hasher.input(&chunk[..filled]),
        }
    }
    Ok(hasher.result())
}
// Smoke test against the fixture archive checked into `fixtures/`: header
// version, first blob of the first and last cluster, and total entry count.
#[test]
fn test_zim() {
    let archive =
        Zim::new("fixtures/wikipedia_ab_all_2017-03.zim").expect("failed to parse fixture");
    assert_eq!(archive.header.version_major, 5);

    // First blob of the first cluster: the ASCII bytes "abk".
    let first_cluster = archive.get_cluster(0).unwrap();
    assert_eq!(&first_cluster.get_blob(0).unwrap()[..], &[97, 98, 107][..]);

    // First blob of the last cluster: starts with the ASCII bytes "GIF89a".
    let last_cluster_idx = archive.header.cluster_count - 1;
    let last_cluster = archive.get_cluster(last_cluster_idx).unwrap();
    let blob = last_cluster.get_blob(0).unwrap();
    assert_eq!(&blob[0..10], &[71, 73, 70, 56, 57, 97, 44, 1, 150, 0]);
    assert_eq!(
        &blob[blob.len() - 10..],
        &[222, 192, 21, 240, 155, 91, 65, 0, 0, 59]
    );

    assert_eq!(archive.iterate_by_urls().count(), 3111);
}