use crate::tokenizer::CellTextSpecialTokens;
use crate::tokenizer::TableSpecialTokens;
use crate::tokenizer::Tokenizer;
use pyo3::prelude::*;
use std::str::FromStr;
#[derive(Debug, Clone, Copy)]
pub enum State {
Idle,
ReadTable,
ReadTableCaption,
ReadCol,
ReadRow,
}
#[pyclass]
#[derive(Debug, Clone)]
pub enum Event {
TableStart {},
TableStyle { text: String },
TableEnd {},
ColStart { cell_type: CellType },
ColStyle { text: String },
ColEnd { text: String },
TableCaptionStart {},
TableCaption { text: String },
RowStart {},
RowStyle { text: String },
RowEnd {},
}
#[pyclass]
#[derive(Debug, Clone)]
pub enum CellType {
HeaderCell,
DataCell,
}
#[pyclass]
#[derive(Debug)]
pub struct WikitextTableParser {
state: State,
#[pyo3(get, set)]
event_log_queue: Vec<Event>,
#[pyo3(get, set)]
tokens: Vec<String>,
text_buffer: String,
table_tokenizer: Tokenizer,
cell_tokenizer: Tokenizer,
clean_cell_text: bool,
}
impl Iterator for WikitextTableParser {
type Item = Event;
fn next(&mut self) -> Option<Event> {
while self.tokens.len() > 0 {
self.step();
}
if self.event_log_queue.len() > 0 {
let event = self.event_log_queue.remove(0);
return Some(event);
}
return None;
}
}
#[pymethods]
impl WikitextTableParser {
#[new]
pub fn new(
table_tokenizer: Tokenizer,
cell_tokenizer: Tokenizer,
wikitext_table: &str,
clean_cell_text: bool,
) -> Self {
let text_for_parse: String = String::from("\n") + wikitext_table;
let parser = WikitextTableParser {
state: State::Idle,
tokens: table_tokenizer.tokenize(&text_for_parse),
event_log_queue: Vec::new(),
text_buffer: String::from(""),
table_tokenizer: table_tokenizer,
cell_tokenizer: cell_tokenizer,
clean_cell_text: clean_cell_text,
};
return parser;
}
fn append_to_text_buffer(&mut self, s: &str) {
let token = TableSpecialTokens::from_str(s);
match token {
Ok(_) => {
}
Err(_) => {
self.text_buffer += s;
}
}
}
fn clear_text_buffer(&mut self) {
self.text_buffer = String::from("")
}
fn split_cell_style_and_text(&self, cell_text: String) -> Vec<String> {
let cell_tokens = self.cell_tokenizer.tokenize(&cell_text);
let mut style = String::new();
let mut cell_text = String::new();
let mut temp = String::new();
let mut already_match_style_end = false;
let mut in_closure = false;
for token in cell_tokens {
match CellTextSpecialTokens::from_str(token.as_str()) {
Ok(cell_text_sp_token) => match cell_text_sp_token {
CellTextSpecialTokens::Sep => {
if !in_closure {
already_match_style_end = true;
}
}
CellTextSpecialTokens::LinkStart => in_closure = true,
CellTextSpecialTokens::LinkEnd => in_closure = false,
CellTextSpecialTokens::TemplateStart => in_closure = true,
CellTextSpecialTokens::TemplateEnd => in_closure = false,
_ => {}
},
Err(_) => {}
}
temp += &token;
if already_match_style_end && style.len() == 0 {
style = temp.clone();
temp = String::new();
}
}
cell_text = temp;
return vec![style, cell_text];
}
fn get_text_buffer_data(&self) -> String {
let cell_raw_text = self.text_buffer.clone().trim().to_string();
let split_texts = self.split_cell_style_and_text(cell_raw_text);
let style: String = split_texts[0].clone();
let cell_text = split_texts[1].clone();
return cell_text;
}
fn get_style_text_buffer_data(&self) -> String {
let cell_raw_text = self.text_buffer.clone().trim().to_string();
let split_texts = self.split_cell_style_and_text(cell_raw_text);
let style = split_texts[0].clone();
let cell_text = split_texts[1].clone();
return style;
}
fn step(&mut self) {
let token = self.tokens.remove(0);
match self.state {
State::Idle => {
if &token == TableSpecialTokens::TableStart.as_ref() {
self.transition(Event::TableStart {})
}
}
State::ReadTable => {
self.append_to_text_buffer(&token);
if &token == TableSpecialTokens::TableCaption.as_ref() {
self.transition(Event::TableStyle {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::TableCaptionStart {});
} else if &token == TableSpecialTokens::TableRow.as_ref() {
self.transition(Event::TableStyle {
text: self.get_style_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::RowStart {});
} else if &token == TableSpecialTokens::TableHeaderCell.as_ref() {
self.transition(Event::TableStyle {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::RowStart {});
}
else if &token == TableSpecialTokens::TableEnd.as_ref() {
self.transition(Event::TableEnd {});
}
}
State::ReadTableCaption => {
self.append_to_text_buffer(&token);
if &token == TableSpecialTokens::TableRow.as_ref() {
self.transition(Event::TableCaption {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::RowStart {});
}
else if &token == TableSpecialTokens::TableHeaderCell.as_ref() {
self.transition(Event::TableCaption {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::RowStart {});
self.transition(Event::RowStyle {
text: String::from(""),
});
self.transition(Event::ColStart {
cell_type: CellType::HeaderCell,
});
}
}
State::ReadRow => {
self.append_to_text_buffer(&token);
if &token == TableSpecialTokens::TableDataCell.as_ref()
|| &token == TableSpecialTokens::TableDataCell2.as_ref()
{
self.transition(Event::RowStyle {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::ColStart {
cell_type: CellType::DataCell,
});
} else if &token == TableSpecialTokens::TableHeaderCell.as_ref()
|| &token == TableSpecialTokens::TableHeaderCell2.as_ref()
{
self.transition(Event::RowStyle {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::ColStart {
cell_type: CellType::HeaderCell,
});
} else if &token == TableSpecialTokens::TableEnd.as_ref() {
self.transition(Event::RowEnd {});
self.transition(Event::TableEnd {});
self.clear_text_buffer();
}
}
State::ReadCol => {
self.append_to_text_buffer(&token);
if &token == TableSpecialTokens::TableDataCell.as_ref()
|| &token == TableSpecialTokens::TableDataCell2.as_ref()
{
self.transition(Event::ColStyle {
text: self.get_style_text_buffer_data(),
});
self.transition(Event::ColEnd {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::ColStart {
cell_type: CellType::DataCell,
});
}
else if &token == TableSpecialTokens::TableHeaderCell.as_ref()
|| &token == TableSpecialTokens::TableHeaderCell2.as_ref()
{
self.transition(Event::ColStyle {
text: self.get_style_text_buffer_data(),
});
self.transition(Event::ColEnd {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::ColStart {
cell_type: CellType::HeaderCell,
});
} else if &token == TableSpecialTokens::TableRow.as_ref() {
self.transition(Event::ColStyle {
text: self.get_style_text_buffer_data(),
});
self.transition(Event::ColEnd {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::RowEnd {});
self.transition(Event::RowStart {});
} else if &token == TableSpecialTokens::TableEnd.as_ref() {
self.transition(Event::ColStyle {
text: self.get_style_text_buffer_data(),
});
self.transition(Event::ColEnd {
text: self.get_text_buffer_data(),
});
self.clear_text_buffer();
self.transition(Event::RowEnd {});
self.transition(Event::TableEnd {});
}
}
}
}
fn transition(&mut self, event: Event) {
self.event_log_queue.push(event.clone());
match (self.state, event) {
(State::Idle, Event::TableStart {}) => self.state = State::ReadTable,
(State::ReadTableCaption, Event::TableCaption { text }) => {
self.state = State::ReadTable
}
(State::ReadTable, Event::TableCaptionStart {}) => self.state = State::ReadTableCaption,
(State::ReadTable, Event::TableEnd {}) => {
self.state = State::Idle;
}
(State::ReadTable, Event::RowStart {}) => self.state = State::ReadRow,
(State::ReadRow, Event::ColStart { cell_type }) => self.state = State::ReadCol,
(State::ReadCol, Event::ColStyle { text }) => {}
(State::ReadCol, Event::ColEnd { text }) => self.state = State::ReadCol,
(State::ReadCol, Event::RowStart {}) => self.state = State::ReadRow,
(_, _) => {}
}
}
}