use crate::extractors::rtf::encoding::{decode_windows_1252, parse_hex_byte, parse_rtf_control_word};
use crate::extractors::rtf::formatting::normalize_whitespace;
use crate::extractors::rtf::images::extract_image_metadata;
use crate::extractors::rtf::tables::TableState;
use crate::types::Table;
/// Control words that mark a group whose entire content is dropped from the output.
///
/// The parser consults this list only for the first control word that follows an
/// opening `{`; on a match, everything up to the brace that closes that group is skipped.
/// Matching is exact and case-sensitive.
const SKIP_DESTINATIONS: &[&str] = &[
    "fonttbl",
    "colortbl",
    "stylesheet",
    "info",
    "listtable",
    "listoverridetable",
    "generator",
    "filetbl",
    "revtbl",
    "rsidtbl",
    "xmlnstbl",
    "mmathPr",
    "themedata",
    "colorschememapping",
    "datastore",
    "latentstyles",
    "datafield",
    "fldinst",
    "objdata",
    "objclass",
    "panose",
    "bkmkstart",
    "bkmkend",
    "field",
    "wgrffmtfilter",
    "fcharset",
    "pgdsctbl",
];
/// Extracts the readable text and the tables from an RTF document.
///
/// The input is scanned character by character:
///
/// * `{` / `}` open and close groups. A group is skipped entirely when it starts with
///   `\*` or with a control word listed in [`SKIP_DESTINATIONS`].
/// * `\\`, `\{` and `\}` produce the literal character.
/// * `\'hh` is decoded with `parse_hex_byte` and `decode_windows_1252`.
/// * Every other `\word` is parsed by `parse_rtf_control_word` and dispatched to
///   `handle_control_word`.
/// * Raw line breaks are ignored; spaces and tabs collapse into a single space.
///
/// Text produced while a table row is open is also appended to that row's current cell.
///
/// # Arguments
///
/// * `content` - the RTF source.
/// * `plain` - forwarded to `handle_control_word` and to `TableState::finalize_with_format`
///   to select plain or Markdown-style output.
///
/// # Returns
///
/// The accumulated text after `normalize_whitespace`, and every table that could be finalized.
pub fn extract_text_from_rtf(content: &str, plain: bool) -> (String, Vec<Table>) {
    // True when `text` is non-empty and does not already end in a space or newline.
    fn needs_separator(text: &str) -> bool {
        !matches!(text.chars().next_back(), None | Some(' ' | '\n'))
    }

    let mut result = String::new();
    let mut chars = content.chars().peekable();
    let mut tables: Vec<Table> = Vec::new();
    let mut table_state: Option<TableState> = None;
    // Current brace nesting level.
    let mut group_depth: i32 = 0;
    // Nesting level of the group being skipped; 0 means nothing is being skipped.
    let mut skip_depth: i32 = 0;
    // Set by `\*`: the next control word names a destination to ignore.
    let mut ignorable_pending = false;
    // Set by `{`: the next control word is the first one of its group.
    let mut expect_destination = false;

    let ensure_table = |table_state: &mut Option<TableState>| {
        table_state.get_or_insert_with(TableState::new);
    };
    let finalize_table = move |state_opt: &mut Option<TableState>, tables: &mut Vec<Table>| {
        let finished = state_opt
            .take()
            .and_then(|state| state.finalize_with_format(plain));
        if let Some(table) = finished {
            tables.push(table);
        }
    };

    while let Some(ch) = chars.next() {
        match ch {
            '{' => {
                group_depth += 1;
                expect_destination = true;
            }
            '}' => {
                group_depth -= 1;
                expect_destination = false;
                ignorable_pending = false;
                // Leaving the group that started the skip ends the skip.
                if skip_depth > 0 && group_depth < skip_depth {
                    skip_depth = 0;
                }
                if skip_depth == 0 && needs_separator(&result) {
                    result.push(' ');
                }
            }
            '\\' => {
                // A trailing backslash at end of input carries no information.
                let Some(&next_ch) = chars.peek() else {
                    continue;
                };
                match next_ch {
                    '\\' | '{' | '}' => {
                        chars.next();
                        expect_destination = false;
                        if skip_depth > 0 {
                            continue;
                        }
                        result.push(next_ch);
                        // Mirror the literal into the open cell, exactly like hex escapes
                        // and plain characters, so table cells keep braces/backslashes.
                        if let Some(state) = table_state.as_mut().filter(|state| state.in_row) {
                            state.current_cell.push(next_ch);
                        }
                    }
                    '\'' => {
                        chars.next();
                        expect_destination = false;
                        // Always consume both hex digits, even inside a skipped group.
                        let hex1 = chars.next();
                        let hex2 = chars.next();
                        if skip_depth > 0 {
                            continue;
                        }
                        let byte = hex1
                            .zip(hex2)
                            .and_then(|(h1, h2)| parse_hex_byte(h1, h2));
                        if let Some(byte) = byte {
                            let decoded = decode_windows_1252(byte);
                            result.push(decoded);
                            if let Some(state) = table_state.as_mut().filter(|state| state.in_row) {
                                state.current_cell.push(decoded);
                            }
                        }
                    }
                    '*' => {
                        chars.next();
                        ignorable_pending = true;
                    }
                    _ => {
                        let (control_word, param) = parse_rtf_control_word(&mut chars);

                        let first_in_group = expect_destination;
                        let ignorable = ignorable_pending;
                        expect_destination = false;
                        ignorable_pending = false;

                        let starts_skipped_group = ignorable
                            || (first_in_group && SKIP_DESTINATIONS.contains(&control_word.as_str()));
                        if starts_skipped_group {
                            // Keep the outermost skip when groups to skip are nested.
                            if skip_depth == 0 {
                                skip_depth = group_depth;
                            }
                            continue;
                        }
                        if skip_depth > 0 {
                            continue;
                        }
                        handle_control_word(
                            &control_word,
                            param,
                            &mut chars,
                            &mut result,
                            &mut table_state,
                            &mut tables,
                            &ensure_table,
                            &finalize_table,
                            plain,
                        );
                    }
                }
            }
            // Raw line breaks in the source are not part of the text.
            '\n' | '\r' => {}
            ' ' | '\t' => {
                if skip_depth > 0 {
                    continue;
                }
                if needs_separator(&result) {
                    result.push(' ');
                }
                let open_cell = table_state
                    .as_mut()
                    .filter(|state| state.in_row && !state.current_cell.ends_with(' '));
                if let Some(state) = open_cell {
                    state.current_cell.push(' ');
                }
            }
            _ => {
                expect_destination = false;
                if skip_depth > 0 {
                    continue;
                }
                // Plain text after at least one completed row, outside any row, ends the table.
                let table_is_over = table_state
                    .as_ref()
                    .is_some_and(|state| !state.in_row && !state.rows.is_empty());
                if table_is_over {
                    finalize_table(&mut table_state, &mut tables);
                }
                result.push(ch);
                if let Some(state) = table_state.as_mut().filter(|state| state.in_row) {
                    state.current_cell.push(ch);
                }
            }
        }
    }

    // Flush a table that was still open when the input ended.
    if table_state.is_some() {
        finalize_table(&mut table_state, &mut tables);
    }
    (normalize_whitespace(&result), tables)
}
/// Applies one parsed RTF control word to the extraction state.
///
/// # Arguments
///
/// * `control_word` / `param` - the word without its backslash, and its optional numeric
///   parameter.
/// * `chars` - the remaining input; advanced when the word owns data that follows it (the
///   fallback after `\uN`, or whatever `extract_image_metadata` consumes for `\pict`).
/// * `result` - the flat text output being accumulated.
/// * `table_state` / `tables` - the table under construction and the finished tables.
/// * `ensure_table` / `finalize_table` - callbacks that create the table state on demand and
///   flush it into `tables`.
/// * `plain` - selects the plain layout: image links and the leading/trailing row pipes are
///   omitted and cells are separated by a bare `|`.
///
/// Text-producing words (`\u`, `\tab`, `\pict`, and the typographic symbols) are written to
/// `result` and, while a table row is open, to the current cell as well. Control words not
/// listed here are ignored.
#[allow(clippy::too_many_arguments)] // each argument is a separate piece of caller-owned parser state
fn handle_control_word(
    control_word: &str,
    param: Option<i32>,
    chars: &mut std::iter::Peekable<std::str::Chars>,
    result: &mut String,
    table_state: &mut Option<TableState>,
    tables: &mut Vec<Table>,
    ensure_table: &dyn Fn(&mut Option<TableState>),
    finalize_table: &dyn Fn(&mut Option<TableState>, &mut Vec<Table>),
    plain: bool,
) {
    // Appends `text` to the flat output and, while a table row is open, to the current cell.
    fn emit(result: &mut String, table_state: &mut Option<TableState>, text: &str) {
        result.push_str(text);
        if let Some(state) = table_state.as_mut().filter(|state| state.in_row) {
            state.current_cell.push_str(text);
        }
    }

    match control_word {
        "u" => {
            let Some(code_num) = param else {
                return;
            };
            // Values above 32767 are written as negative 16-bit numbers; map them back.
            let code_u = if code_num < 0 {
                (code_num + 65536) as u32
            } else {
                code_num as u32
            };
            // `char::from_u32` rejects surrogate code units, so those produce no output.
            if let Some(c) = char::from_u32(code_u) {
                emit(result, table_state, c.encode_utf8(&mut [0u8; 4]));
            }
            // `\uN` is followed by one fallback unit for readers without Unicode support.
            // Skip it so the character is not emitted twice. The unit is either a single
            // plain character or a whole `\'hh` hex escape.
            match chars.peek().copied() {
                Some('\\') => {
                    let mut ahead = chars.clone();
                    ahead.next();
                    if ahead.next() == Some('\'') {
                        // Backslash, apostrophe and the two hex digits.
                        for _ in 0..4 {
                            chars.next();
                        }
                    }
                }
                Some('{') | Some('}') | None => {}
                Some(_) => {
                    chars.next();
                }
            }
        }
        "pict" => {
            let image_metadata = extract_image_metadata(chars);
            if !image_metadata.is_empty() && !plain {
                let mut markdown = String::from("![image](");
                markdown.push_str(&image_metadata);
                markdown.push_str(") ");
                emit(result, table_state, &markdown);
            }
        }
        "par" | "line" => {
            if table_state.is_some() {
                finalize_table(table_state, tables);
            }
            if !result.is_empty() && !result.ends_with('\n') {
                result.push_str("\n\n");
            }
        }
        "tab" => emit(result, table_state, "\t"),
        "bullet" => emit(result, table_state, "\u{2022}"),
        "lquote" => emit(result, table_state, "\u{2018}"),
        "rquote" => emit(result, table_state, "\u{2019}"),
        "ldblquote" => emit(result, table_state, "\u{201C}"),
        "rdblquote" => emit(result, table_state, "\u{201D}"),
        "endash" => emit(result, table_state, "\u{2013}"),
        "emdash" => emit(result, table_state, "\u{2014}"),
        "trowd" => {
            ensure_table(table_state);
            if let Some(state) = table_state.as_mut() {
                state.start_row();
            }
            // A row always starts on its own line.
            if !result.is_empty() && !result.ends_with('\n') {
                result.push('\n');
            }
            if !plain && !result.ends_with('|') {
                result.push_str("| ");
            }
        }
        "cell" => {
            if let Some(state) = table_state.as_mut().filter(|state| state.in_row) {
                state.push_cell();
            }
            if plain {
                if !result.ends_with('|') && !result.ends_with('\n') && !result.is_empty() {
                    result.push('|');
                }
            } else {
                if !result.ends_with('|') {
                    if !result.ends_with(' ') && !result.is_empty() {
                        result.push(' ');
                    }
                    result.push('|');
                }
                if !result.ends_with(' ') {
                    result.push(' ');
                }
            }
        }
        "row" => {
            ensure_table(table_state);
            let pending_row = table_state
                .as_mut()
                .filter(|state| state.in_row || !state.current_cell.is_empty());
            if let Some(state) = pending_row {
                state.push_row();
            }
            if !plain && !result.ends_with('|') {
                result.push('|');
            }
            if !result.ends_with('\n') {
                result.push('\n');
            }
        }
        _ => {}
    }
}