#[macro_use]
mod macros;
mod boolean;
mod check_step;
mod collect;
mod condition;
mod consume;
mod depth;
mod element_condition;
mod error;
mod outcome;
mod paragraph;
mod parser;
mod parser_wrap;
mod result;
mod rule;
mod string;
mod strip;
mod token;
/// Convenience re-exports gathered under a single module path.
///
/// Bundles this module's token, error and result types together with
/// `WikitextSettings`, `FullText`, and the tree's `Element` / `Elements`.
// NOTE(review): presumably glob-imported by the parser submodules — confirm.
mod prelude {
pub use crate::parsing::{
ExtractedToken, ParseError, ParseErrorKind, ParseResult, ParseSuccess, Token,
};
pub use crate::settings::WikitextSettings;
pub use crate::text::FullText;
pub use crate::tree::{Element, Elements};
}
use self::depth::{DepthItem, DepthList, process_depths};
use self::element_condition::{ElementCondition, ElementConditionType};
use self::paragraph::{NO_CLOSE_CONDITION, gather_paragraphs};
use self::parser::Parser;
use self::parser_wrap::ParserWrap;
use self::rule::impls::RULE_PAGE;
use self::strip::{strip_newlines, strip_whitespace};
use crate::data::PageInfo;
use crate::next_index::{Incrementer, NextIndex};
use crate::settings::WikitextSettings;
use crate::tokenizer::Tokenization;
use crate::tree::{
AttributeMap, BibliographyList, CodeBlock, Element, LinkLabel, LinkLocation,
LinkType, ListItem, ListType, SyntaxTree,
};
use std::borrow::Cow;
pub use self::boolean::{NonBooleanValue, parse_boolean};
pub use self::error::{ParseError, ParseErrorKind};
pub use self::outcome::ParseOutcome;
pub use self::result::{ParseResult, ParseSuccess};
pub use self::token::{ExtractedToken, Token};
/// Parses a tokenized page and assembles the final [`SyntaxTree`].
///
/// The raw parse is delegated to [`parse_internal`]; this function then
/// post-processes its output:
///
/// * On success, the recorded `(depth, heading)` pairs are grouped with
///   `process_depths` and each resulting group is turned into a bullet list
///   by `build_toc_list_element`. A footnote block is requested only when
///   footnotes were collected and the page did not already contain one.
/// * On a top-level parse error, the whole input text is emitted as a single
///   text element, the error is reported as the only error, and the table of
///   contents, footnotes and bibliographies are left empty.
///
/// In both cases the HTML and code blocks returned by `parse_internal` are
/// forwarded unchanged, and the result is whatever
/// `SyntaxTree::from_element_result` produces.
pub fn parse<'r, 't>(
    tokenization: &'r Tokenization<'t>,
    page_info: &'r PageInfo<'t>,
    settings: &'r WikitextSettings,
) -> ParseOutcome<SyntaxTree<'t>>
where
    'r: 't,
{
    let UnstructuredParseResult {
        result: outcome,
        html_blocks,
        code_blocks,
        table_of_contents_depths: toc_depths,
        footnotes: parsed_footnotes,
        has_footnote_block,
        bibliographies: parsed_bibliographies,
    } = parse_internal(page_info, settings, tokenization);

    // Source of the indexes used for table of contents anchors.
    let mut toc_indexer = settings.id_indexer();

    debug!("Finished paragraph gathering, matching on consumption");

    // Each arm yields the pieces that differ between success and failure;
    // the shared call to `from_element_result` happens once, below.
    let (elements, errors, table_of_contents, footnotes, needs_footnote_block, bibliographies) =
        match outcome {
            Ok(ParseSuccess {
                item: elements,
                errors,
                ..
            }) => {
                debug!(
                    "Finished parsing, producing final syntax tree ({} errors)",
                    errors.len(),
                );

                // process_depths() takes a list-type marker per entry;
                // the table of contents has none, so `()` is supplied.
                let depth_entries = toc_depths
                    .into_iter()
                    .map(|(depth, heading)| (depth, (), heading));

                let mut toc_lists = Vec::new();
                for (_, items) in process_depths((), depth_entries) {
                    toc_lists.push(build_toc_list_element(&mut toc_indexer, items));
                }

                // Only needed if there are footnotes and no block for them yet.
                let needs_block = !(parsed_footnotes.is_empty() || has_footnote_block);

                (
                    elements,
                    errors,
                    toc_lists,
                    parsed_footnotes,
                    needs_block,
                    parsed_bibliographies,
                )
            }
            Err(error) => {
                error!("Fatal error occurred at highest-level parsing: {error:#?}");

                // Fall back to emitting the input text verbatim.
                let wikitext = tokenization.full_text().inner();

                // NOTE(review): a footnote block is requested here even though
                // the footnote list is empty, unlike the success path — confirm
                // this is intended.
                (
                    vec![text!(wikitext)],
                    vec![error],
                    Vec::new(),
                    Vec::new(),
                    true,
                    BibliographyList::new(),
                )
            }
        };

    SyntaxTree::from_element_result(
        elements,
        errors,
        (html_blocks, code_blocks),
        table_of_contents,
        (footnotes, needs_footnote_block),
        bibliographies,
        tokenization.full_text().len(),
    )
}
/// Runs the parser and returns its output without building a `SyntaxTree`.
///
/// A `Parser` is created for the given tokenization, page info and settings,
/// and `gather_paragraphs` is run over it with `RULE_PAGE` and
/// `NO_CLOSE_CONDITION`. Afterwards the HTML blocks, code blocks, table of
/// contents entries, footnotes, footnote-block flag and bibliographies are
/// taken from the parser and returned alongside the parse result.
pub fn parse_internal<'r, 't>(
    page_info: &'r PageInfo<'t>,
    settings: &'r WikitextSettings,
    tokenization: &'r Tokenization<'t>,
) -> UnstructuredParseResult<'r, 't>
where
    'r: 't,
{
    let mut parser = Parser::new(tokenization, page_info, settings);

    info!("Running parser on {} tokens", tokenization.tokens().len());
    let result = gather_paragraphs(&mut parser, RULE_PAGE, NO_CLOSE_CONDITION);

    // Field initializers are evaluated in the order written, so the parser
    // is queried in the same sequence as the fields are listed here.
    UnstructuredParseResult {
        result,
        html_blocks: parser.remove_html_blocks(),
        code_blocks: parser.remove_code_blocks(),
        table_of_contents_depths: parser.remove_table_of_contents(),
        footnotes: parser.remove_footnotes(),
        has_footnote_block: parser.has_footnote_block(),
        bibliographies: parser.remove_bibliographies(),
    }
}
/// Converts one depth list of table of contents entries into a bullet list.
///
/// Each `DepthItem::Item` becomes a list item holding a single
/// table-of-contents link labelled with the entry's name. The link points to
/// `#toc{index}` when `incr` yields an index, and to `javascript:;` when it
/// yields `None`. Each `DepthItem::List` becomes a sub-list built by
/// recursing with the same `incr`, so indexes keep advancing across nesting
/// levels.
fn build_toc_list_element(
    incr: &mut Incrementer,
    list: DepthList<(), String>,
) -> Element<'static> {
    let items = list
        .into_iter()
        .map(|entry| match entry {
            DepthItem::Item(heading) => {
                let href = if let Some(index) = incr.next() {
                    Cow::Owned(format!("#toc{index}"))
                } else {
                    Cow::Borrowed("javascript:;")
                };

                let toc_link = Element::Link {
                    ltype: LinkType::TableOfContents,
                    link: LinkLocation::Url(href),
                    label: LinkLabel::Text(Cow::Owned(heading)),
                    target: None,
                };

                ListItem::Elements {
                    elements: vec![toc_link],
                    attributes: AttributeMap::new(),
                }
            }
            DepthItem::List(_, sublist) => ListItem::SubList {
                element: Box::new(build_toc_list_element(incr, sublist)),
            },
        })
        .collect();

    Element::List {
        ltype: ListType::Bullet,
        items,
        attributes: AttributeMap::new(),
    }
}
/// Raw output of `parse_internal`, before it is assembled into a `SyntaxTree`.
///
/// Holds the parse result itself plus the side data that `parse_internal`
/// takes out of the parser once paragraph gathering has finished.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct UnstructuredParseResult<'r, 't> {
/// Outcome of `gather_paragraphs`: the parsed elements on success,
/// or the error that stopped top-level parsing.
pub result: ParseResult<'r, 't, Vec<Element<'t>>>,
/// Value returned by the parser's `remove_html_blocks()`.
pub html_blocks: Vec<Cow<'t, str>>,
/// Value returned by the parser's `remove_code_blocks()`.
pub code_blocks: Vec<CodeBlock<'t>>,
/// Value returned by the parser's `remove_table_of_contents()`;
/// `parse` reads each pair as `(depth, contents)`.
pub table_of_contents_depths: Vec<(usize, String)>,
/// Value returned by the parser's `remove_footnotes()`.
// NOTE(review): presumably one inner `Vec` per footnote — confirm.
pub footnotes: Vec<Vec<Element<'t>>>,
/// Value returned by the parser's `has_footnote_block()`; when `true`,
/// `parse` does not request an additional footnote block.
pub has_footnote_block: bool,
/// Value returned by the parser's `remove_bibliographies()`.
pub bibliographies: BibliographyList<'t>,
}