use std::env;
use std::error::Error;
use std::fs::File;
use std::io::{BufRead, BufReader};
use dnn::{TextDetectionModel_EAST, TextRecognitionModel};
use opencv::core::{CommandLineParser, Point, Point2f, Scalar, Size, Vector};
use opencv::prelude::*;
use opencv::{core, dnn, highgui, imgproc, not_opencv_branch_34, opencv_branch_34, videoio};
use videoio::VideoCapture;
not_opencv_branch_34! {
use opencv::imgproc::{LINE_8, FONT_HERSHEY_SIMPLEX};
}
opencv_branch_34! {
use opencv::core::{LINE_8, FONT_HERSHEY_SIMPLEX};
}
type Result<T, E = Box<dyn Error>> = std::result::Result<T, E>;
const KEYS: &str = concat!(
"{ help h | | Print help message. }",
"{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}",
"{ detModel dmp | | Path to a binary .pb file contains trained detector network.}",
"{ width | 320 | Preprocess input image by resizing to a specific width. It should be multiple by 32. }",
"{ height | 320 | Preprocess input image by resizing to a specific height. It should be multiple by 32. }",
"{ thr | 0.5 | Confidence threshold. }",
"{ nms | 0.4 | Non-maximum suppression threshold. }",
"{ recModel rmp | | Path to a binary .onnx file contains trained CRNN text recognition model. ",
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}",
"{ RGBInput rgb |0| 0: imread with flags=IMREAD_GRAYSCALE; 1: imread with flags=IMREAD_COLOR. }",
"{ vocabularyPath vp | alphabet_36.txt | Path to benchmarks for evaluation. ",
"Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}",
);
pub fn main() -> Result<()> {
let args = env::args().collect::<Vec<_>>();
let args = args.iter().map(|arg| arg.as_str()).collect::<Vec<_>>();
let mut parser = CommandLineParser::new(&args, KEYS)?;
parser.about(
"Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of \
EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)",
)?;
if args.len() == 1 || parser.has("help")? {
parser.print_message()?;
return Ok(());
}
let conf_threshold = parser.get_f64("thr", true)? as f32;
let nms_threshold = parser.get_f64("nms", true)? as f32;
let width = parser.get_i32("width", true)?;
let height = parser.get_i32("height", true)?;
let imread_rgb = parser.get_bool("RGBInput", true)?;
let det_model_path = parser.get_str("detModel", true)?;
let rec_model_path = parser.get_str("recModel", true)?;
let voc_path = parser.get_str("vocabularyPath", true)?;
let mut detector = TextDetectionModel_EAST::from_file(&det_model_path, "")?;
detector
.set_confidence_threshold(conf_threshold)?
.set_nms_threshold(nms_threshold)?;
let mut recognizer = TextRecognitionModel::from_file(&rec_model_path, "")?;
let mut vocabulary = Vector::<String>::new();
let voc_file = BufReader::new(File::open(voc_path)?);
for voc_line in voc_file.lines() {
vocabulary.push(&voc_line?);
}
recognizer.set_vocabulary(&vocabulary)?.set_decode_type("CTC-greedy")?;
let rec_scale = 1. / 127.5;
let rec_mean = Scalar::from((127.5, 127.5, 127.5));
let rec_input_size = Size::new(100, 32);
recognizer.set_input_params(rec_scale, rec_input_size, rec_mean, false, false)?;
let det_scale = 1.;
let det_input_size = Size::new(width, height);
let det_mean = Scalar::from((123.68, 116.78, 103.94));
let swap_rb = true;
detector.set_input_params(det_scale, det_input_size, det_mean, swap_rb, false)?;
let mut cap = VideoCapture::new(0, videoio::CAP_ANY)?;
let open_success = VideoCapture::is_opened(&cap)?;
if !open_success {
panic!("Unable to open default camera!");
}
let win_name = "EAST: An Efficient and Accurate Scene Text Detector";
let mut frame = Mat::default();
while highgui::wait_key(1)? < 0 {
cap.read(&mut frame)?;
if frame.empty() {
highgui::wait_key(0)?;
break;
}
println!("Frame size: {:?}", frame.size()?);
let mut det_results = Vector::new();
detector.detect(&frame, &mut det_results)?;
if !det_results.is_empty() {
let rec_input = if !imread_rgb {
let mut rec_input = Mat::default();
imgproc::cvt_color_def(&frame, &mut rec_input, imgproc::COLOR_BGR2GRAY)?;
Some(rec_input)
} else {
None
};
let mut contours = Vector::<Vector<Point>>::new();
for quadrangle in &det_results {
let mut quadrangle_2f = Vector::<Point2f>::new();
for pt in &quadrangle {
quadrangle_2f.push(Point2f::new(pt.x as f32, pt.y as f32))
}
let cropped = four_points_transform(rec_input.as_ref().unwrap_or(&frame), quadrangle_2f.as_slice().try_into()?)?;
let recognition_result = recognizer.recognize(&cropped)?;
println!("Recognition result: {recognition_result}");
imgproc::put_text(
&mut frame,
&recognition_result,
quadrangle.get(3)?,
FONT_HERSHEY_SIMPLEX,
1.5,
(0, 0, 255).into(),
2,
LINE_8,
false,
)?;
contours.push(quadrangle);
}
imgproc::polylines(&mut frame, &contours, true, (0, 255, 0).into(), 2, LINE_8, 0)?;
}
let mut big_frame = Mat::default();
imgproc::resize(&frame, &mut big_frame, Size::default(), 3., 3., imgproc::INTER_NEAREST)?;
highgui::imshow(win_name, &big_frame)?;
}
Ok(())
}
fn four_points_transform(frame: &Mat, vertices: &[Point2f; 4]) -> Result<Mat> {
let output_size = Size::new(100, 32);
let target_vertices = [
Point2f::new(0., (output_size.height - 1) as f32),
Point2f::new(0., 0.),
Point2f::new((output_size.width - 1) as f32, 0.),
Point2f::new((output_size.width - 1) as f32, (output_size.height - 1) as f32),
];
let rotation_matrix = imgproc::get_perspective_transform_slice(vertices, &target_vertices, core::DECOMP_LU)?;
let mut out = Mat::default();
imgproc::warp_perspective_def(frame, &mut out, &rotation_matrix, output_size)?;
Ok(out)
}