kalosm_language/context/io/
mod.rs

1use crate::context::document::Document;
2use crate::context::document::IntoDocument;
3use crate::context::document::IntoDocuments;
4use std::path::PathBuf;
5use tokio::task::JoinSet;
6mod docx;
7pub use docx::*;
8mod html;
9pub use html::*;
10mod md;
11pub use md::*;
12mod pdf;
13pub use self::pdf::*;
14mod txt;
15pub use txt::*;
16
17use super::ExtractDocumentError;
18
19/// An error that can occur when reading a document from the file system.
20#[derive(Debug, thiserror::Error)]
21pub enum FsDocumentError<E = std::convert::Infallible> {
22    /// An error reading the file
23    #[error("Failed to read document: {0}")]
24    Read(#[from] std::io::Error),
25    /// An error decoding the file
26    #[error("Failed to decode document: {0}")]
27    Decode(E),
28    /// Wrong file type
29    #[error("Wrong file type")]
30    WrongFileType,
31}
32
33impl<E> FsDocumentError<E> {
34    fn map_decode<F, E2>(self, f: F) -> FsDocumentError<E2>
35    where
36        F: FnOnce(E) -> E2,
37    {
38        match self {
39            FsDocumentError::Read(err) => FsDocumentError::Read(err),
40            FsDocumentError::Decode(err) => FsDocumentError::Decode(f(err)),
41            FsDocumentError::WrongFileType => FsDocumentError::WrongFileType,
42        }
43    }
44}
45
46/// An error that can occur when decoding a text file.
47#[derive(Debug, thiserror::Error)]
48#[non_exhaustive]
49pub enum TextFileDecodeError {
50    /// An error extracting the document from the text file
51    #[error("Failed to extract document from text file: {0}")]
52    Extract(#[from] ExtractDocumentError),
53    /// An error decoding the pdf file
54    #[error("Failed to decode pdf file: {0}")]
55    Pdf(#[from] lopdf::Error),
56    /// An error reading the docx file
57    #[error("Failed to read docx file: {0}")]
58    Docx(#[from] docx_rs::ReaderError),
59}
60
61/// A document that can be read from the file system.
62///
63/// # Example
64/// ```rust, no_run
65/// use kalosm_language::prelude::*;
66/// use std::path::PathBuf;
67///
68/// #[tokio::main]
69/// async fn main() {
70///     let document = FsDocument::try_from(PathBuf::from("./documents"))
71///         .unwrap()
72///         .into_document()
73///         .await
74///         .unwrap();
75///     println!("document: {:?}", document);
76/// }
77/// ```
78#[derive(Debug, Clone)]
79pub enum FsDocument {
80    /// A docx document.
81    Docx(DocxDocument),
82    /// An html document.
83    Html(HtmlDocument),
84    /// A markdown document.
85    Md(MdDocument),
86    /// A pdf document.
87    Pdf(PdfDocument),
88    /// A text document.
89    Txt(TextDocument),
90}
91
92impl TryFrom<PathBuf> for FsDocument {
93    type Error = FsDocumentError;
94
95    fn try_from(path: PathBuf) -> Result<Self, Self::Error> {
96        if !path.is_file() {
97            return Err(std::io::Error::from(std::io::ErrorKind::NotFound).into());
98        }
99        match path.extension().and_then(|ext| ext.to_str()) {
100            Some("docx") => Ok(Self::Docx(DocxDocument::try_from(path)?)),
101            Some("html") => Ok(Self::Html(HtmlDocument::try_from(path)?)),
102            Some("md") => Ok(Self::Md(MdDocument::try_from(path)?)),
103            Some("pdf") => Ok(Self::Pdf(PdfDocument::try_from(path)?)),
104            Some("txt") => Ok(Self::Txt(TextDocument::try_from(path)?)),
105            _ => Err(FsDocumentError::WrongFileType),
106        }
107    }
108}
109
110impl IntoDocument for FsDocument {
111    type Error = FsDocumentError<TextFileDecodeError>;
112
113    async fn into_document(self) -> Result<Document, Self::Error> {
114        match self {
115            Self::Docx(docx) => docx
116                .into_document()
117                .await
118                .map_err(|err| err.map_decode(TextFileDecodeError::Docx)),
119            Self::Html(html) => html
120                .into_document()
121                .await
122                .map_err(|err| err.map_decode(TextFileDecodeError::Extract)),
123            Self::Md(md) => md
124                .into_document()
125                .await
126                .map_err(|err| err.map_decode(TextFileDecodeError::Extract)),
127            Self::Pdf(pdf) => pdf
128                .into_document()
129                .await
130                .map_err(|err| err.map_decode(|_| unreachable!())),
131            Self::Txt(txt) => txt
132                .into_document()
133                .await
134                .map_err(|err| err.map_decode(|_| unreachable!())),
135        }
136    }
137}
138
/// A directory on disk whose files are loaded as documents.
///
/// Loading walks sub-directories recursively; entries that cannot be turned
/// into an [`FsDocument`] are skipped.
///
/// # Example
/// ```rust, no_run
/// # use kalosm::language::*;
/// # use std::io::Write;
/// # use std::path::PathBuf;
/// #[tokio::main]
/// async fn main() {
///     // You can load a whole folder full of documents with the DocumentFolder source
///     let folder = DocumentFolder::try_from(PathBuf::from("./documents")).unwrap();
///     // Grab all the documents out of the folder
///     let documents = folder.into_documents().await.unwrap();
///
///     // Then chunk the documents into sentences and use those chunks however you need
///     let model = Bert::new().await.unwrap();
///     let chunked = SemanticChunker::new()
///         .chunk_batch(&documents, &model)
///         .await
///         .unwrap();
///     println!("{:?}", chunked);
/// }
/// ```
#[derive(Debug, Clone)]
pub struct DocumentFolder {
    /// Root directory that is scanned for documents.
    path: PathBuf,
}
166
167/// The path to a document folder was not a directory
168#[derive(Debug, thiserror::Error)]
169#[error("The path to a document folder was not a directory")]
170pub struct DocumentFolderNotDirectoryError;
171
172impl TryFrom<PathBuf> for DocumentFolder {
173    type Error = DocumentFolderNotDirectoryError;
174
175    fn try_from(path: PathBuf) -> Result<Self, Self::Error> {
176        if !path.is_dir() {
177            return Err(DocumentFolderNotDirectoryError);
178        }
179        Ok(Self { path })
180    }
181}
182
183impl IntoDocuments for DocumentFolder {
184    type Error = FsDocumentError<TextFileDecodeError>;
185
186    async fn into_documents(self) -> Result<Vec<Document>, Self::Error> {
187        let mut set = JoinSet::new();
188        self.start_into_documents(&mut set).await?;
189        let mut documents = Vec::new();
190        while let Some(join) = set.join_next().await {
191            let Ok(join) = join else {
192                continue;
193            };
194            documents.push(join?);
195        }
196        Ok(documents)
197    }
198}
199
200impl DocumentFolder {
201    /// Try to create a new document folder from a path.
202    ///
203    /// # Example
204    /// ```rust, no_run
205    /// use kalosm_language::prelude::*;
206    ///
207    /// let folder = DocumentFolder::new("./documents").unwrap();
208    /// ```
209    pub fn new(path: impl Into<PathBuf>) -> Result<Self, DocumentFolderNotDirectoryError> {
210        Self::try_from(path.into())
211    }
212
213    fn start_into_documents<'a>(
214        &'a self,
215        set: &'a mut JoinSet<Result<Document, FsDocumentError<TextFileDecodeError>>>,
216    ) -> std::pin::Pin<
217        Box<dyn std::future::Future<Output = Result<(), std::io::Error>> + Send + Sync + 'a>,
218    > {
219        Box::pin(async move {
220            let mut read_dir = tokio::fs::read_dir(&self.path).await?;
221            while let Some(entry) = read_dir.next_entry().await? {
222                let path = entry.path();
223                if path.is_dir() {
224                    if let Ok(folder) = DocumentFolder::try_from(path) {
225                        folder.start_into_documents(set).await?;
226                    }
227                } else if let Ok(document) = FsDocument::try_from(path) {
228                    set.spawn(document.into_document());
229                }
230            }
231            Ok(())
232        })
233    }
234}