use std::collections::HashMap;
use std::io::Cursor;
use crate::graphics::Color;
use crate::semantic::{EntityType, SemanticEntity};
use crate::text::Font;
/// Visual treatment applied to each redacted region.
///
/// Every style covers the region with a filled black rectangle;
/// [`RedactionStyle::Placeholder`] additionally draws its label in white on
/// top of that rectangle.
///
/// The type is comparable (`PartialEq`/`Eq`) so callers and tests can check
/// which style a configuration carries without pattern matching.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RedactionStyle {
    /// Cover the region with a solid black rectangle and nothing else.
    BlackBox,
    /// Cover the region with a black rectangle and draw the given label on it.
    Placeholder(String),
}

impl Default for RedactionStyle {
    /// Returns [`RedactionStyle::BlackBox`].
    fn default() -> Self {
        Self::BlackBox
    }
}
/// Selects which entities are redacted and how the redacted regions look.
#[derive(Debug, Clone)]
pub struct RedactionConfig {
/// Entity types to redact. An entity is redacted when its type appears in
/// this list, so an empty list redacts nothing.
pub entity_types: Vec<EntityType>,
/// Visual style drawn over each redacted region.
pub style: RedactionStyle,
}
impl Default for RedactionConfig {
    /// Builds a configuration with no entity types selected and the
    /// [`RedactionStyle::BlackBox`] style.
    fn default() -> Self {
        let entity_types = Vec::new();
        let style = RedactionStyle::BlackBox;
        Self {
            entity_types,
            style,
        }
    }
}
impl RedactionConfig {
    /// Creates the default configuration; equivalent to
    /// `RedactionConfig::default()`.
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns this configuration with its entity type list replaced by
    /// `types`. The style is carried over unchanged.
    pub fn with_types(self, types: Vec<EntityType>) -> Self {
        Self {
            entity_types: types,
            ..self
        }
    }

    /// Returns this configuration with its style replaced by `style`. The
    /// entity type list is carried over unchanged.
    pub fn with_style(self, style: RedactionStyle) -> Self {
        Self { style, ..self }
    }
}
/// Record of a single redaction applied to a page.
#[derive(Debug, Clone)]
pub struct RedactionEntry {
/// Copy of the redacted entity's `id`.
pub entity_id: String,
/// Copy of the redacted entity's `entity_type`.
pub entity_type: EntityType,
/// Page the redaction was drawn on, counted from 1.
pub page: u32,
}
/// Summary of the redactions performed by one `SemanticRedactor::redact` call.
#[derive(Debug)]
pub struct RedactionReport {
// One entry per redaction that was drawn; read through the accessor methods.
entries: Vec<RedactionEntry>,
}
impl RedactionReport {
    /// Number of redactions recorded in this report.
    pub fn redacted_count(&self) -> usize {
        self.entries.len()
    }

    /// Entries whose entity type equals `entity_type`, in report order.
    pub fn by_type(&self, entity_type: &EntityType) -> Vec<&RedactionEntry> {
        let mut matching = Vec::new();
        for entry in &self.entries {
            if entry.entity_type == *entity_type {
                matching.push(entry);
            }
        }
        matching
    }

    /// Page numbers that received at least one redaction, ascending and
    /// without duplicates.
    pub fn pages_affected(&self) -> Vec<u32> {
        let mut page_numbers: Vec<u32> = self.entries.iter().map(|entry| entry.page).collect();
        // Equal `u32` values are indistinguishable, so an unstable sort
        // produces the same sequence as a stable one.
        page_numbers.sort_unstable();
        page_numbers.dedup();
        page_numbers
    }

    /// All recorded entries, in the order they were added.
    pub fn entries(&self) -> &[RedactionEntry] {
        self.entries.as_slice()
    }
}
/// Failures reported by `SemanticRedactor::redact`.
///
/// Each variant carries the underlying error rendered as a string.
#[derive(Debug, thiserror::Error)]
pub enum SemanticRedactorError {
/// The input bytes could not be opened by the PDF reader.
#[error("parse failed: {0}")]
ParseFailed(String),
/// Counting, fetching, or rebuilding a page of the parsed document failed.
#[error("page reconstruction failed: {0}")]
PageReconstructionFailed(String),
/// The output document could not be serialized to bytes.
#[error("write failed: {0}")]
WriteFailed(String),
}
/// Result alias whose error type is [`SemanticRedactorError`].
pub type SemanticRedactorResult<T> = Result<T, SemanticRedactorError>;
/// Field-less type that groups the redaction entry points as associated functions.
pub struct SemanticRedactor;
impl SemanticRedactor {
pub fn redact(
pdf_bytes: &[u8],
entities: &[SemanticEntity],
config: RedactionConfig,
) -> SemanticRedactorResult<(Vec<u8>, RedactionReport)> {
let to_redact: Vec<&SemanticEntity> = if config.entity_types.is_empty() {
Vec::new()
} else {
entities
.iter()
.filter(|e| config.entity_types.contains(&e.entity_type))
.collect()
};
if to_redact.is_empty() {
return Ok((
pdf_bytes.to_vec(),
RedactionReport {
entries: Vec::new(),
},
));
}
let mut by_page: HashMap<u32, Vec<&SemanticEntity>> = HashMap::new();
for entity in &to_redact {
by_page.entry(entity.bounds.page).or_default().push(entity);
}
let cursor = Cursor::new(pdf_bytes);
let reader = crate::parser::PdfReader::new(cursor)
.map_err(|e| SemanticRedactorError::ParseFailed(e.to_string()))?;
let document = reader.into_document();
let page_count = document
.page_count()
.map_err(|e| SemanticRedactorError::PageReconstructionFailed(e.to_string()))?;
let mut output_doc = crate::document::Document::new();
let mut report_entries = Vec::new();
for page_idx in 0..page_count {
let parsed_page = document
.get_page(page_idx)
.map_err(|e| SemanticRedactorError::PageReconstructionFailed(e.to_string()))?;
let mut page = crate::page::Page::from_parsed_with_content(&parsed_page, &document)
.map_err(|e| SemanticRedactorError::PageReconstructionFailed(e.to_string()))?;
let page_num_1indexed = (page_idx + 1) as u32;
if let Some(page_entities) = by_page.get(&page_num_1indexed) {
for entity in page_entities {
let bbox = &entity.bounds;
page.graphics()
.set_fill_color(Color::black())
.rect(
bbox.x as f64,
bbox.y as f64,
bbox.width as f64,
bbox.height as f64,
)
.fill();
if let RedactionStyle::Placeholder(ref text) = config.style {
let font_size = (bbox.height as f64 * 0.6).min(10.0).max(4.0);
let text_ctx = page.text();
text_ctx.set_font(Font::Helvetica, font_size);
text_ctx.set_fill_color(Color::white());
text_ctx.at(
bbox.x as f64 + 2.0,
bbox.y as f64 + (bbox.height as f64 - font_size) / 2.0,
);
let _ = text_ctx.write(text);
}
report_entries.push(RedactionEntry {
entity_id: entity.id.clone(),
entity_type: entity.entity_type.clone(),
page: page_num_1indexed,
});
}
}
output_doc.add_page(page);
}
let output_bytes = output_doc
.to_bytes()
.map_err(|e| SemanticRedactorError::WriteFailed(e.to_string()))?;
Ok((
output_bytes,
RedactionReport {
entries: report_entries,
},
))
}
}