1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*!
`parse_mediawiki_sql` parses SQL dumps of a MediaWiki database.
The SQL dumps are scripts that create a database table and insert rows into it.
The entry point is `iterate_sql_insertions`, which creates an iterable struct
from a byte slice (`&[u8]`). The struct is generic over the type returned by the iterator,
and this type must be one of the structs in the [`schemas`](schemas) module,
which represent rows in the database, such as [`Page`](schemas::Page).

## Usage
This crate is available from [crates.io](https://crates.io/crate/parse-mediawiki-sql) and can be
used by adding `parse-mediawiki-sql` to your dependencies in your project's `Cargo.toml`.

```toml
[dependencies]
parse-mediawiki-sql = "0.1"
```

If you're using Rust 2015, then you’ll also need to add it to your crate root:

```no_run
extern crate parse_mediawiki_sql;
```

## Example
To generate a `Vec` containing the titles of all redirect pages:

```no_run
use memmap::Mmap;
use parse_mediawiki_sql::{
    iterate_sql_insertions,
    schemas::Page,
    types::{PageNamespace, PageTitle},
};
use std::fs::File;
let page_sql =
    unsafe { Mmap::map(&File::open("page.sql").unwrap()).unwrap() };
let redirects: Vec<(PageNamespace, PageTitle)> =
    iterate_sql_insertions(&page_sql)
        .filter_map(
            |Page { namespace, title, is_redirect, .. }| {
                if is_redirect {
                    Some((namespace, title))
                } else {
                    None
                }
            },
        )
        .collect();
```

Only a mutable reference to the struct is iterable, so a `for`-loop
must use `&mut` or `.into_iter()` to iterate over the struct:

```no_run
# use parse_mediawiki_sql::{
#     iterate_sql_insertions,
#     schemas::Page,
# };
# use memmap::Mmap;
# use std::fs::File;
# let page_sql =
#     unsafe { Mmap::map(&File::open("page.sql").unwrap()).unwrap() };
for Page { namespace, title, is_redirect, .. } in &mut iterate_sql_insertions(&page_sql) {
    if is_redirect {
        dbg!((namespace, title));
    }
}
```
*/

use bstr::{ByteSlice, B};
use nom::{
    branch::alt,
    bytes::streaming::{tag, take_while},
    character::streaming::multispace0,
    combinator::{iterator, opt, recognize, ParserIterator},
    sequence::{preceded, tuple},
};
pub use types::{Error, IResult};

pub mod schemas;
pub mod types;

/**
Trait for converting from a SQL tuple to a Rust type,
which can borrow from the string or not.
Used by [`iterate_sql_insertions`][crate::iterate_sql_insertions].
*/
pub trait FromSqlTuple<'a>: Sized {
    fn from_sql_tuple(s: &'a [u8]) -> IResult<'a, Self>;
}

/**
Takes a SQL dump of a MediaWiki database table as bytes
and yields a struct that is iterable as a mutable reference,
yielding structs representing the database rows.
*/
pub fn iterate_sql_insertions<'a, T>(
    sql: &'a [u8],
) -> ParserIterator<&'a [u8], Error<'a>, impl Fn(&'a [u8]) -> IResult<'a, T>>
where
    T: FromSqlTuple<'a> + 'a,
{
    let sql = &sql[sql.find("INSERT INTO").expect("INSERT INTO statement")..];
    iterator(
        sql,
        preceded(
            alt((
                recognize(tuple((
                    opt(multispace0),
                    opt(tag(";")),
                    opt(multispace0),
                    tuple((
                        tag(B("INSERT INTO `")),
                        take_while(|b| b'a' <= b && b <= b'z' || b == b'_'),
                        tag(B("` VALUES ")),
                    )),
                ))),
                tag(","),
            )),
            FromSqlTuple::from_sql_tuple,
        ),
    )
}