kalosm_language/context/io/
mod.rs1use crate::context::document::Document;
2use crate::context::document::IntoDocument;
3use crate::context::document::IntoDocuments;
4use std::path::PathBuf;
5use tokio::task::JoinSet;
6mod docx;
7pub use docx::*;
8mod html;
9pub use html::*;
10mod md;
11pub use md::*;
12mod pdf;
13pub use self::pdf::*;
14mod txt;
15pub use txt::*;
16
17use super::ExtractDocumentError;
18
19#[derive(Debug, thiserror::Error)]
21pub enum FsDocumentError<E = std::convert::Infallible> {
22 #[error("Failed to read document: {0}")]
24 Read(#[from] std::io::Error),
25 #[error("Failed to decode document: {0}")]
27 Decode(E),
28 #[error("Wrong file type")]
30 WrongFileType,
31}
32
33impl<E> FsDocumentError<E> {
34 fn map_decode<F, E2>(self, f: F) -> FsDocumentError<E2>
35 where
36 F: FnOnce(E) -> E2,
37 {
38 match self {
39 FsDocumentError::Read(err) => FsDocumentError::Read(err),
40 FsDocumentError::Decode(err) => FsDocumentError::Decode(f(err)),
41 FsDocumentError::WrongFileType => FsDocumentError::WrongFileType,
42 }
43 }
44}
45
46#[derive(Debug, thiserror::Error)]
48#[non_exhaustive]
49pub enum TextFileDecodeError {
50 #[error("Failed to extract document from text file: {0}")]
52 Extract(#[from] ExtractDocumentError),
53 #[error("Failed to decode pdf file: {0}")]
55 Pdf(#[from] lopdf::Error),
56 #[error("Failed to read docx file: {0}")]
58 Docx(#[from] docx_rs::ReaderError),
59}
60
61#[derive(Debug, Clone)]
79pub enum FsDocument {
80 Docx(DocxDocument),
82 Html(HtmlDocument),
84 Md(MdDocument),
86 Pdf(PdfDocument),
88 Txt(TextDocument),
90}
91
92impl TryFrom<PathBuf> for FsDocument {
93 type Error = FsDocumentError;
94
95 fn try_from(path: PathBuf) -> Result<Self, Self::Error> {
96 if !path.is_file() {
97 return Err(std::io::Error::from(std::io::ErrorKind::NotFound).into());
98 }
99 match path.extension().and_then(|ext| ext.to_str()) {
100 Some("docx") => Ok(Self::Docx(DocxDocument::try_from(path)?)),
101 Some("html") => Ok(Self::Html(HtmlDocument::try_from(path)?)),
102 Some("md") => Ok(Self::Md(MdDocument::try_from(path)?)),
103 Some("pdf") => Ok(Self::Pdf(PdfDocument::try_from(path)?)),
104 Some("txt") => Ok(Self::Txt(TextDocument::try_from(path)?)),
105 _ => Err(FsDocumentError::WrongFileType),
106 }
107 }
108}
109
110impl IntoDocument for FsDocument {
111 type Error = FsDocumentError<TextFileDecodeError>;
112
113 async fn into_document(self) -> Result<Document, Self::Error> {
114 match self {
115 Self::Docx(docx) => docx
116 .into_document()
117 .await
118 .map_err(|err| err.map_decode(TextFileDecodeError::Docx)),
119 Self::Html(html) => html
120 .into_document()
121 .await
122 .map_err(|err| err.map_decode(TextFileDecodeError::Extract)),
123 Self::Md(md) => md
124 .into_document()
125 .await
126 .map_err(|err| err.map_decode(TextFileDecodeError::Extract)),
127 Self::Pdf(pdf) => pdf
128 .into_document()
129 .await
130 .map_err(|err| err.map_decode(|_| unreachable!())),
131 Self::Txt(txt) => txt
132 .into_document()
133 .await
134 .map_err(|err| err.map_decode(|_| unreachable!())),
135 }
136 }
137}
138
/// A directory on the file system whose files can be loaded as documents.
///
/// Loading walks the directory recursively and skips entries that cannot be
/// turned into an [`FsDocument`].
#[derive(Debug, Clone)]
pub struct DocumentFolder {
    /// The directory to read documents from.
    path: PathBuf,
}
166
167#[derive(Debug, thiserror::Error)]
169#[error("The path to a document folder was not a directory")]
170pub struct DocumentFolderNotDirectoryError;
171
172impl TryFrom<PathBuf> for DocumentFolder {
173 type Error = DocumentFolderNotDirectoryError;
174
175 fn try_from(path: PathBuf) -> Result<Self, Self::Error> {
176 if !path.is_dir() {
177 return Err(DocumentFolderNotDirectoryError);
178 }
179 Ok(Self { path })
180 }
181}
182
183impl IntoDocuments for DocumentFolder {
184 type Error = FsDocumentError<TextFileDecodeError>;
185
186 async fn into_documents(self) -> Result<Vec<Document>, Self::Error> {
187 let mut set = JoinSet::new();
188 self.start_into_documents(&mut set).await?;
189 let mut documents = Vec::new();
190 while let Some(join) = set.join_next().await {
191 let Ok(join) = join else {
192 continue;
193 };
194 documents.push(join?);
195 }
196 Ok(documents)
197 }
198}
199
200impl DocumentFolder {
201 pub fn new(path: impl Into<PathBuf>) -> Result<Self, DocumentFolderNotDirectoryError> {
210 Self::try_from(path.into())
211 }
212
213 fn start_into_documents<'a>(
214 &'a self,
215 set: &'a mut JoinSet<Result<Document, FsDocumentError<TextFileDecodeError>>>,
216 ) -> std::pin::Pin<
217 Box<dyn std::future::Future<Output = Result<(), std::io::Error>> + Send + Sync + 'a>,
218 > {
219 Box::pin(async move {
220 let mut read_dir = tokio::fs::read_dir(&self.path).await?;
221 while let Some(entry) = read_dir.next_entry().await? {
222 let path = entry.path();
223 if path.is_dir() {
224 if let Ok(folder) = DocumentFolder::try_from(path) {
225 folder.start_into_documents(set).await?;
226 }
227 } else if let Ok(document) = FsDocument::try_from(path) {
228 set.spawn(document.into_document());
229 }
230 }
231 Ok(())
232 })
233 }
234}