1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*!
[![crates.io](https://img.shields.io/crates/v/parse-mediawiki-sql.svg)](https://crates.io/crates/parse-mediawiki-sql)
[![docs.rs](https://img.shields.io/docsrs/parse-mediawiki-sql)](https://docs.rs/parse-mediawiki-sql)

`parse_mediawiki_sql` parses SQL dumps of a MediaWiki database.
The SQL dumps are scripts that create a database table and insert rows into it.
The entry point is `iterate_sql_insertions`, which creates an iterable struct
from a byte slice (`&[u8]`). The struct is generic over the type returned by the iterator,
and this type must be one of the structs in the [`schemas`](schemas) module,
which represent rows in the database, such as [`Page`](schemas::Page).

## Usage
This crate is available from [crates.io](https://crates.io/crates/parse-mediawiki-sql) and can be
used by adding `parse-mediawiki-sql` to your dependencies in your project's `Cargo.toml`.

```toml
[dependencies]
parse-mediawiki-sql = "0.3"
```

If you're using Rust 2015, then you'll also need to add it to your crate root:

```no_run
extern crate parse_mediawiki_sql;
```

## Example
To generate a `Vec` containing the titles of all redirect pages:

```no_run
use parse_mediawiki_sql::{
    iterate_sql_insertions,
    schemas::Page,
    field_types::{PageNamespace, PageTitle},
    utils::memory_map,
};
use std::fs::File;
let page_sql = unsafe { memory_map("page.sql")? };
let redirects: Vec<(PageNamespace, PageTitle)> =
    iterate_sql_insertions(&page_sql)
        .filter_map(
            |Page { namespace, title, is_redirect, .. }| {
                if is_redirect {
                    Some((namespace, title))
                } else {
                    None
                }
            },
        )
        .collect();
```

Only a mutable reference to the struct is iterable, so a `for`-loop
must use `&mut` or `.into_iter()` to iterate over the struct:

```no_run
# use parse_mediawiki_sql::{
#     iterate_sql_insertions,
#     schemas::Page,
# };
# use memmap::Mmap;
# use std::fs::File;
# let page_sql =
#     unsafe { Mmap::map(&File::open("page.sql").unwrap()).unwrap() };
for Page { namespace, title, is_redirect, .. } in &mut iterate_sql_insertions(&page_sql) {
    if is_redirect {
        dbg!((namespace, title));
    }
}
```
*/

#![cfg_attr(docsrs, feature(doc_cfg))]

use bstr::{ByteSlice, B};
use nom::{
    branch::alt,
    bytes::streaming::{tag, take_while},
    character::streaming::multispace0,
    combinator::{iterator, opt, recognize, ParserIterator},
    sequence::{preceded, tuple},
};

pub mod error;
pub mod field_types;
pub mod from_sql;
pub mod schemas;

pub use error::Error;
pub use from_sql::IResult;
#[cfg(feature = "utils")]
#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
pub mod utils;

/**
Conversion from the bytes of a SQL tuple into a Rust value.

The lifetime `'a` is shared between the input slice and the parse result,
so an implementor is free to either borrow from the input or own its data.
This is the parser that
[`iterate_sql_insertions`][crate::iterate_sql_insertions] applies to each row.
*/
pub trait FromSqlTuple<'a>
where
    Self: Sized,
{
    /// Parses one value of `Self` from the front of `s`.
    fn from_sql_tuple(s: &'a [u8]) -> IResult<'a, Self>;
}

/**
Creates a row iterator over the bytes of a MediaWiki SQL dump.

Everything in `sql` before the first occurrence of `INSERT INTO` is skipped.
From there on, each row is parsed with [`FromSqlTuple::from_sql_tuple`] after
consuming one of the following prefixes:

- an ``INSERT INTO `table_name` VALUES `` header, optionally preceded by
  whitespace, a `;`, and more whitespace, where `table_name` consists of
  ASCII lowercase letters and underscores; or
- a single `,` separating two rows.

The returned struct implements `Iterator` only through a mutable reference,
so it has to be iterated as `&mut` (method calls such as `.filter_map(..)`
take that reference automatically).

# Panics
Panics if `sql` does not contain `INSERT INTO`.
*/
#[must_use = "the return type implements `Iterator` as a mutable reference, and does nothing unless consumed"]
pub fn iterate_sql_insertions<'a, T>(
    sql: &'a [u8],
) -> ParserIterator<&'a [u8], Error<'a>, impl FnMut(&'a [u8]) -> IResult<'a, T>>
where
    T: FromSqlTuple<'a> + 'a,
{
    // Drop the table definition and anything else ahead of the first insertion.
    let first_insert = sql.find("INSERT INTO").expect("INSERT INTO statement");
    let insertions = &sql[first_insert..];

    // Bytes accepted inside the backtick-quoted table name.
    let is_table_name_byte = |b: u8| b.is_ascii_lowercase() || b == b'_';

    // Start of a statement (possibly closing the previous one with `;`);
    // `recognize` collapses the whole match into a single slice.
    let statement_header = recognize(tuple((
        opt(multispace0),
        opt(tag(";")),
        opt(multispace0),
        tuple((
            tag(B("INSERT INTO `")),
            take_while(is_table_name_byte),
            tag(B("` VALUES ")),
        )),
    )));
    // Separator between consecutive rows of the same statement.
    let row_separator = tag(",");

    let row = preceded(alt((statement_header, row_separator)), T::from_sql_tuple);
    iterator(insertions, row)
}