use std::{io::Cursor, process::Stdio};
use futures::StreamExt;
use oris_runtime::{
document_loaders::{HtmlLoader, Loader},
schemas::Document,
text_splitter::{PlainTextSplitter, PlainTextSplitterOptions, TextSplitter},
tools::{Text2SpeechOpenAI, Tool},
};
use tokio::{io::AsyncReadExt, process::Command};
use url::Url;
/// Downloads a web page, turns its first text chunks into speech and merges the
/// resulting audio files into `output.mp3` with `ffmpeg`.
///
/// Steps:
/// 1. Fetch the page and load it through `HtmlLoader`.
/// 2. Split the text into chunks of at most 3000 units (as configured on the
///    splitter) and keep only the first `MAX_CHUNKS` of them.
/// 3. Synthesize every chunk with `Text2SpeechOpenAI` into `chunk_<i>.mp3`.
/// 4. Concatenate the chunk files with `ffmpeg` (stream copy, no re-encoding).
/// 5. Delete the intermediate chunk files and print the final path.
///
/// # Panics
///
/// Panics when fetching, loading, splitting, speech synthesis, spawning
/// `ffmpeg` or file-system clean-up fails.
///
/// # Exit codes
///
/// Exits with a non-zero status when no text chunk was produced or when
/// `ffmpeg` reports a failure; in the latter case the intermediate chunk files
/// are kept so the merge can be inspected or retried.
#[tokio::main]
async fn main() {
    /// Upper bound on the number of text chunks that are converted to audio.
    const MAX_CHUNKS: usize = 2;

    let url = "https://en.m.wikivoyage.org/wiki/Seoul";
    let output_path = "output.mp3";

    println!("Fetching URL: {}\n", url);
    let html = reqwest::get(url)
        .await
        .expect("Failed to fetch the page")
        // Refuse to narrate an HTTP error page (4xx/5xx) as if it were the article.
        .error_for_status()
        .expect("Server answered with an error status")
        .text()
        .await
        .expect("Failed to read the response body");

    let html_loader = HtmlLoader::new(Cursor::new(html), Url::parse(url).unwrap());
    let documents: Vec<Document> = html_loader
        .load()
        .await
        .unwrap()
        .map(|x| x.unwrap())
        .collect()
        .await;

    let splitter = PlainTextSplitter::new(
        PlainTextSplitterOptions::default()
            .with_chunk_size(3000)
            .with_chunk_overlap(0)
            .with_trim_chunks(true),
    );
    let text_chunks = splitter
        .split_documents(&documents)
        .await
        .unwrap()
        .into_iter()
        .take(MAX_CHUNKS)
        .collect::<Vec<Document>>();

    // Without any chunk there is nothing to synthesize and ffmpeg would be
    // handed an empty `concat:` input.
    if text_chunks.is_empty() {
        eprintln!("No text could be extracted from {}; nothing to convert.", url);
        std::process::exit(1);
    }

    for (i, chunk) in text_chunks.iter().enumerate() {
        println!(
            "Processing chunk {} of {} with chunk size {}: \n{}\n",
            // Shown 1-based so the last chunk reads "N of N".
            i + 1,
            text_chunks.len(),
            chunk.page_content.len(),
            &chunk.page_content
        );
        let openai = Text2SpeechOpenAI::default().with_path(chunk_file_name(i));
        let path = openai.call(&chunk.page_content).await.unwrap();
        let path = std::path::Path::new(&path).canonicalize().unwrap();
        println!("Chunk file saved at: {:?}\n\n", path);
    }

    // ffmpeg's `concat:` protocol takes the input files separated by `|`.
    let chunks_paths_list = (0..text_chunks.len())
        .map(chunk_file_name)
        .collect::<Vec<String>>()
        .join("|");
    let args: Vec<String> = vec![
        "-hide_banner".into(),
        "-i".into(),
        format!("concat:{}", chunks_paths_list),
        "-acodec".into(),
        "copy".into(),
        // Overwrite an existing output file instead of prompting.
        "-y".into(),
        output_path.into(),
    ];
    println!(
        "Merging {} audio chunks using: ffmpeg {}\n",
        text_chunks.len(),
        args.join(" ")
    );

    let mut child = Command::new("ffmpeg")
        .args(&args)
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()
        .expect("Failed to start ffmpeg process.");
    let mut stdout = child.stdout.take().expect("Failed to open stdout");
    let mut stderr = child.stderr.take().expect("Failed to open stderr");

    // Both pipes are drained in their own tasks so ffmpeg can never block on a
    // full pipe while this function is waiting for it to exit.
    let stdout_handle = tokio::spawn(async move {
        let mut buffer = [0u8; 1024];
        loop {
            match stdout.read(&mut buffer).await {
                Ok(0) => break,
                Ok(size) => {
                    let output = String::from_utf8_lossy(&buffer[..size]);
                    print!("FFmpeg STDOUT: {}", output);
                }
                Err(err) => {
                    eprintln!("Failed to read FFmpeg stdout: {}", err);
                    break;
                }
            }
        }
    });
    let stderr_handle = tokio::spawn(async move {
        let mut buffer = [0u8; 1024];
        loop {
            match stderr.read(&mut buffer).await {
                Ok(0) => break,
                Ok(size) => {
                    let error = String::from_utf8_lossy(&buffer[..size]);
                    eprint!("FFmpeg STDERR: {}", error);
                }
                Err(err) => {
                    eprintln!("Failed to read FFmpeg stderr: {}", err);
                    break;
                }
            }
        }
    });

    let ffmpeg_exit_status = child.wait().await.unwrap();
    stdout_handle.await.unwrap();
    stderr_handle.await.unwrap();
    println!(
        "FFmpeg process finished with exit status {}",
        ffmpeg_exit_status
    );

    // Only clean up and report success when the merge actually worked;
    // otherwise the chunk files are the only usable result of the run.
    if !ffmpeg_exit_status.success() {
        eprintln!(
            "FFmpeg failed ({}); keeping intermediate audio chunk files.",
            ffmpeg_exit_status
        );
        std::process::exit(ffmpeg_exit_status.code().unwrap_or(1));
    }

    println!("Cleaning up intermediate audio chunk files...");
    for i in 0..text_chunks.len() {
        tokio::fs::remove_file(chunk_file_name(i)).await.unwrap();
    }
    println!("Cleaning up intermediate audio chunk files complete.");

    let path = std::path::Path::new(output_path).canonicalize().unwrap();
    println!("Final audio saved at: {:?}", path);
}

/// Returns the file name used for the synthesized audio of chunk `index`.
///
/// Single source of truth for the name used when writing, merging and
/// deleting the intermediate files.
fn chunk_file_name(index: usize) -> String {
    format!("chunk_{}.mp3", index)
}