parse_mediawiki_sql/
lib.rs

1/*!
2[![crates.io](https://img.shields.io/crates/v/parse-mediawiki-sql.svg)](https://crates.io/crates/parse-mediawiki-sql)
3[![docs.rs](https://img.shields.io/docsrs/parse-mediawiki-sql)](https://docs.rs/parse-mediawiki-sql)
4
5`parse_mediawiki_sql` parses SQL dumps of a MediaWiki database.
6The SQL dumps are scripts that create a database table and insert rows into it.
7The entry point is `iterate_sql_insertions`, which creates an iterable struct
8from a byte slice (`&[u8]`). The struct is generic over the type returned by the iterator,
9and this type must be one of the structs in the [`schemas`](schemas) module,
10which represent rows in the database, such as [`Page`](schemas::Page).
11
12## Usage
This crate is available from [crates.io](https://crates.io/crates/parse-mediawiki-sql) and can be
used by adding `parse-mediawiki-sql` to your dependencies in your project's `Cargo.toml`.
15
16```toml
17[dependencies]
18parse-mediawiki-sql = "0.10"
19```
20
If you're using Rust 2015, then you'll also need to add it to your crate root:
22
23```no_run
24extern crate parse_mediawiki_sql;
25```
26
27## Example
28To generate a `Vec` containing the titles of all redirect pages:
29
30```no_run
31# #[cfg(feature = "utils")]
32# fn main() -> Result<(), Box<dyn std::error::Error>> {
33use parse_mediawiki_sql::{
34    iterate_sql_insertions,
35    schemas::Page,
36    field_types::{PageNamespace, PageTitle},
37    utils::memory_map,
38};
39use std::fs::File;
40let page_sql = unsafe { memory_map("page.sql")? };
41let redirects: Vec<(PageNamespace, PageTitle)> =
42    iterate_sql_insertions(&page_sql)
43        .filter_map(
44            |Page { namespace, title, is_redirect, .. }| {
45                if is_redirect {
46                    Some((namespace, title))
47                } else {
48                    None
49                }
50            },
51        )
52        .collect();
53# Ok(())
54# }
55# #[cfg(not(feature = "utils"))]
56# fn main() {}
57```
58
59Only a mutable reference to the struct is iterable, so a `for`-loop
60must use `&mut` or `.into_iter()` to iterate over the struct:
61
62```no_run
63# #[cfg(feature = "utils")]
64# fn main() -> Result<(), Box<dyn std::error::Error>> {
65# use parse_mediawiki_sql::{
66#     iterate_sql_insertions,
67#     schemas::Page,
68#     utils::memory_map,
69# };
70# let page_sql =
71#     unsafe { memory_map("page.sql")? };
72for Page { namespace, title, is_redirect, .. } in &mut iterate_sql_insertions(&page_sql) {
73    if is_redirect {
74        dbg!((namespace, title));
75    }
76}
77# Ok(())
78# }
79# #[cfg(not(feature = "utils"))]
80# fn main() {}
81```
82*/
83
84#![cfg_attr(docsrs, feature(doc_cfg))]
85
86use bstr::{ByteSlice, B};
87use nom::{
88    branch::alt,
89    bytes::streaming::{tag, take_while},
90    character::streaming::multispace0,
91    combinator::{iterator, opt, recognize, ParserIterator},
92    sequence::{preceded, tuple},
93};
94
95pub mod error;
96pub mod field_types;
97pub mod from_sql;
98pub mod schemas;
99
100pub use error::Error;
101pub use from_sql::IResult;
102#[cfg(feature = "utils")]
103#[cfg_attr(docsrs, doc(cfg(feature = "utils")))]
104pub mod utils;
105
106/**
107Trait for converting from a SQL tuple to a Rust type,
108which can borrow from the string or not.
109Used by [`iterate_sql_insertions`].
110*/
111pub trait FromSqlTuple<'input>: Sized {
112    fn from_sql_tuple(s: &'input [u8]) -> IResult<'input, Self>;
113}
114
115/**
116The entry point of the crate. Takes a SQL dump of a MediaWiki database table as bytes
117and yields an iterator over structs representing rows in the table.
118
119The return value is iterable as a mutable reference,
120and when iterated it yields structs representing the database rows (`Row`).
121These rows are represented as tuples in the SQL code.
122The tuples are parsed using [`FromSqlTuple::from_sql_tuple`]
123and the fields in the tuples are parsed by [`FromSql::from_sql`](from_sql::FromSql::from_sql).
124
125See the [example][crate#example] in the documentation, and see [`schemas`] for the full list of possible `Row`s.
126*/
127#[must_use = "the return type implements `Iterator` as a mutable reference, and does nothing unless consumed"]
128pub fn iterate_sql_insertions<'input, Row>(
129    sql: &'input [u8],
130) -> ParserIterator<&'input [u8], Error<'input>, impl FnMut(&'input [u8]) -> IResult<'input, Row>>
131where
132    Row: FromSqlTuple<'input> + 'input,
133{
134    let sql = &sql[sql.find("INSERT INTO").expect("INSERT INTO statement")..];
135    iterator(
136        sql,
137        preceded(
138            alt((
139                recognize(tuple((
140                    opt(multispace0),
141                    opt(tag(";")),
142                    opt(multispace0),
143                    tuple((
144                        tag(B("INSERT INTO `")),
145                        take_while(|b| (b'a'..=b'z').contains(&b) || b == b'_'),
146                        tag(B("` VALUES ")),
147                    )),
148                ))),
149                tag(","),
150            )),
151            FromSqlTuple::from_sql_tuple,
152        ),
153    )
154}