#[macro_use]
mod macros;
mod boolean;
mod check_step;
mod collect;
mod condition;
mod consume;
mod depth;
mod element_condition;
mod error;
mod outcome;
mod paragraph;
mod parser;
mod parser_wrap;
mod result;
mod rule;
mod string;
mod strip;
mod token;
/// Re-exports of parsing, settings, text, and tree types gathered under a
/// single module path.
///
/// NOTE(review): presumably glob-imported by the parser submodules declared
/// in this file — confirm against their `use` lines.
mod prelude {
pub use crate::parsing::{
ExtractedToken, ParseError, ParseErrorKind, ParseResult, ParseSuccess, Token,
};
pub use crate::settings::WikitextSettings;
pub use crate::text::FullText;
pub use crate::tree::{Element, Elements, OwnedElementsIterator};
}
use self::depth::{process_depths, DepthItem, DepthList};
use self::element_condition::{ElementCondition, ElementConditionType};
use self::paragraph::{gather_paragraphs, NO_CLOSE_CONDITION};
use self::parser::Parser;
use self::parser_wrap::ParserWrap;
use self::rule::impls::RULE_PAGE;
use self::string::parse_string;
use self::strip::{strip_newlines, strip_whitespace};
use crate::data::PageInfo;
use crate::next_index::{NextIndex, TableOfContentsIndex};
use crate::settings::WikitextSettings;
use crate::tokenizer::Tokenization;
use crate::tree::{
AttributeMap, BibliographyList, Element, LinkLabel, LinkLocation, LinkType, ListItem,
ListType, SyntaxTree,
};
use std::borrow::Cow;
pub use self::boolean::{parse_boolean, NonBooleanValue};
pub use self::error::{ParseError, ParseErrorKind};
pub use self::outcome::ParseOutcome;
pub use self::result::{ParseResult, ParseSuccess};
pub use self::token::{ExtractedToken, Token};
/// Parses tokenized wikitext into a [`SyntaxTree`].
///
/// The heavy lifting is delegated to [`parse_internal`]; this function then
/// assembles the final tree from the pieces it returns:
///
/// * On success, the collected table-of-contents depths are turned into list
///   elements, and an [`Element::FootnoteBlock`] is appended to the elements
///   when the parser did not report one.
/// * On a top-level parse error, the full input text is returned as a single
///   text element together with that error; the table of contents, footnotes,
///   and bibliographies are empty.
///
/// In both cases the result is produced by `SyntaxTree::from_element_result`,
/// which also receives the length of the full input text.
pub fn parse<'r, 't>(
    tokenization: &'r Tokenization<'t>,
    page_info: &'r PageInfo<'t>,
    settings: &'r WikitextSettings,
) -> ParseOutcome<SyntaxTree<'t>>
where
    'r: 't,
{
    // Run the parser and take its raw output apart.
    let UnstructuredParseResult {
        result: outcome,
        table_of_contents_depths: toc_depths,
        footnotes: parsed_footnotes,
        has_footnote_block: footnote_block_present,
        bibliographies: parsed_bibliographies,
    } = parse_internal(page_info, settings, tokenization);

    info!("Finished paragraph gathering, matching on consumption");

    // Both arms yield the same five pieces, so the tree is built in one place below.
    let (elements, errors, table_of_contents, footnotes, bibliographies) = match outcome {
        Ok(ParseSuccess {
            item: mut elements,
            errors,
            ..
        }) => {
            info!(
                "Finished parsing, producing final syntax tree ({} errors)",
                errors.len(),
            );

            // Hands out the index used in each table-of-contents anchor.
            let mut toc_index = Incrementer(0);

            // The depth processor wants (depth, list type, item) triples;
            // the table of contents has no list type, hence the unit value.
            let depth_entries = toc_depths
                .into_iter()
                .map(|(depth, contents)| (depth, (), contents));

            let mut table_of_contents = Vec::new();
            for (_, items) in process_depths((), depth_entries) {
                table_of_contents.push(build_toc_list_element(&mut toc_index, items));
            }

            if !footnote_block_present {
                info!("No footnote block in elements, appending one");
                elements.push(Element::FootnoteBlock {
                    title: None,
                    hide: false,
                });
            }

            (
                elements,
                errors,
                table_of_contents,
                parsed_footnotes,
                parsed_bibliographies,
            )
        }
        Err(error) => {
            // Fall back to echoing the input back as plain text, alongside the error.
            error!("Fatal error occurred at highest-level parsing: {error:#?}");
            let wikitext = tokenization.full_text().inner();

            (
                vec![text!(wikitext)],
                vec![error],
                Vec::new(),
                Vec::new(),
                BibliographyList::new(),
            )
        }
    };

    SyntaxTree::from_element_result(
        elements,
        errors,
        table_of_contents,
        footnotes,
        bibliographies,
        tokenization.full_text().len(),
    )
}
/// Runs the parser over the tokens and returns its raw output without
/// assembling a syntax tree.
///
/// A [`Parser`] is created for the given tokenization, page info, and
/// settings, and the top-level elements are gathered into paragraphs using
/// `RULE_PAGE` with no close condition. Afterwards the table-of-contents
/// depths, footnotes, footnote-block flag, and bibliographies are taken from
/// the parser and returned alongside the parse result.
pub fn parse_internal<'r, 't>(
    page_info: &'r PageInfo<'t>,
    settings: &'r WikitextSettings,
    tokenization: &'r Tokenization<'t>,
) -> UnstructuredParseResult<'r, 't>
where
    'r: 't,
{
    let mut parser = Parser::new(tokenization, page_info, settings);

    info!("Running parser on tokens");
    let result = gather_paragraphs(&mut parser, RULE_PAGE, NO_CLOSE_CONDITION);

    // Field initializers run in the order written, so the parser state is
    // extracted in the same sequence as the fields appear here.
    UnstructuredParseResult {
        result,
        table_of_contents_depths: parser.remove_table_of_contents(),
        footnotes: parser.remove_footnotes(),
        has_footnote_block: parser.has_footnote_block(),
        bibliographies: parser.remove_bibliographies(),
    }
}
/// Converts a depth list of heading names into a bullet-list element.
///
/// Each plain item becomes a list entry holding a single table-of-contents
/// link whose URL is `#toc` followed by the next index from `incr` and whose
/// label is the item's name. Each nested depth list is converted recursively
/// into a sub-list, sharing the same `incr` so indexes keep counting upward
/// in traversal order.
fn build_toc_list_element(
    incr: &mut Incrementer,
    list: DepthList<(), String>,
) -> Element<'static> {
    let mut items = Vec::new();

    for entry in list {
        let list_item = match entry {
            DepthItem::List(_, sublist) => {
                let nested = build_toc_list_element(incr, sublist);
                ListItem::SubList {
                    element: Box::new(nested),
                }
            }
            DepthItem::Item(name) => {
                let index = incr.next();
                let anchor = format!("#toc{}", index);

                ListItem::Elements {
                    elements: vec![Element::Link {
                        ltype: LinkType::TableOfContents,
                        link: LinkLocation::Url(Cow::Owned(anchor)),
                        label: LinkLabel::Text(Cow::Owned(name)),
                        target: None,
                    }],
                    attributes: AttributeMap::new(),
                }
            }
        };

        items.push(list_item);
    }

    Element::List {
        ltype: ListType::Bullet,
        items,
        attributes: AttributeMap::new(),
    }
}
/// Counter that hands out consecutive table-of-contents indexes.
///
/// The wrapped value is the index that the next call to `next()` will return.
#[derive(Debug)]
struct Incrementer(usize);

impl NextIndex<TableOfContentsIndex> for Incrementer {
    /// Returns the current index and advances the counter by one.
    fn next(&mut self) -> usize {
        let Self(counter) = self;
        let issued = *counter;
        *counter = issued + 1;
        issued
    }
}
/// Raw output of `parse_internal()`, before it is assembled into a syntax tree.
///
/// `parse()` destructures this value and builds the final `SyntaxTree` from it.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct UnstructuredParseResult<'r, 't> {
/// Outcome of gathering the top-level elements into paragraphs.
pub result: ParseResult<'r, 't, Vec<Element<'t>>>,
/// `(depth, text)` pairs taken from the parser's table-of-contents state.
/// In `parse()`, each text ends up as the label of a table-of-contents link.
pub table_of_contents_depths: Vec<(usize, String)>,
/// Footnotes taken from the parser.
/// NOTE(review): presumably one inner list of elements per footnote — confirm.
pub footnotes: Vec<Vec<Element<'t>>>,
/// Value reported by the parser's `has_footnote_block()`.
/// When `false`, `parse()` appends an `Element::FootnoteBlock` to the elements.
pub has_footnote_block: bool,
/// Bibliographies taken from the parser.
pub bibliographies: BibliographyList<'t>,
}