Struct PartitionParameters

Source

pub struct PartitionParameters {
    pub coordinates: bool,
    pub encoding: Option<String>,
    pub extract_image_block_types: Vec<String>,
    pub gz_uncompressed_content_type: Option<String>,
    pub hi_res_model_name: Option<String>,
    pub include_page_breaks: bool,
    pub languages: Option<Vec<String>>,
    pub output_format: String,
    pub skip_infer_table_types: Vec<String>,
    pub starting_page_number: Option<i32>,
    pub strategy: Strategy,
    pub unique_element_ids: bool,
    pub xml_keep_tags: bool,
    pub chunking_strategy: Option<ChunkingStrategy>,
    pub combine_under_n_chars: Option<i32>,
    pub include_orig_elements: bool,
    pub max_characters: Option<i32>,
    pub multipage_sections: bool,
    pub new_after_n_chars: Option<i32>,
    pub overlap: i32,
    pub overlap_all: bool,
    pub similarity_threshold: Option<f64>,
}

Fields§

§coordinates: bool

If True, return coordinates for each element extracted via OCR. Default: False.

§encoding: Option<String>

The encoding method used to decode the text input. Default: utf-8

§extract_image_block_types: Vec<String>

The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields. Default: [].

§gz_uncompressed_content_type: Option<String>

If file is gzipped, use this content type after unzipping.

§hi_res_model_name: Option<String>

The name of the inference model used when strategy is hi_res

§include_page_breaks: bool

If true, the output will include page breaks if the filetype supports it. Default: false

§languages: Option<Vec<String>>

The languages present in the document, for use in partitioning and/or OCR. See the Tesseract documentation for a full list of languages. Default: [].

§output_format: String

The format of the response. Supported formats are application/json and text/csv. Default: application/json.

§skip_infer_table_types: Vec<String>

The document types for which table extraction should be skipped. Default: [].

§starting_page_number: Option<i32>

When a PDF is split into pages before being sent to the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27.

§strategy: Strategy

The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto

§unique_element_ids: bool

When True, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in a database). Otherwise a SHA-256 of element text is used. Default: False

§xml_keep_tags: bool

If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to XML documents. Default: false

§chunking_strategy: Option<ChunkingStrategy>

Use one of the supported strategies to chunk the returned elements after partitioning. When ‘chunking_strategy’ is not specified, no chunking is performed and any other chunking parameters provided are ignored. Supported strategies: ‘basic’, ‘by_page’, ‘by_similarity’, or ‘by_title’

§combine_under_n_chars: Option<i32>

If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500

§include_orig_elements: bool

When a chunking strategy is specified, each returned chunk will include the elements consolidated to form that chunk as .metadata.orig_elements. Default: true.

§max_characters: Option<i32>

If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 500

§multipage_sections: bool

If chunking strategy is set, determines if sections can span multiple pages. Default: true

§new_after_n_chars: Option<i32>

If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500

§overlap: i32

Specifies the length of a string (‘tail’) to be drawn from each chunk and prefixed to the next chunk as a context-preserving mechanism. By default, this only applies to split-chunks where an oversized element is divided into multiple chunks by text-splitting. Default 0.

§overlap_all: bool

When True, apply overlap between ‘normal’ chunks formed from whole elements and not subject to text-splitting. Use this with caution as it entails a certain level of ‘pollution’ of otherwise clean semantic chunk boundaries. Default false.

§similarity_threshold: Option<f64>

A value between 0.0 and 1.0 describing the minimum similarity two elements must have to be included in the same chunk. Note that similar elements may be separated to meet chunk-size criteria; this value can only guarantee that two elements with similarity below the threshold will appear in separate chunks.

Struct PartitionParameters

Fields§

Trait Implementations§

impl Debug for PartitionParameters

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Default for PartitionParameters

fn default() -> Self

impl<'de> Deserialize<'de> for PartitionParameters

fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error> where __D: Deserializer<'de>,

impl From<PartitionParameters> for Form

fn from(value: PartitionParameters) -> Self

impl Serialize for PartitionParameters

fn serialize<__S>(&self, __serializer: __S) -> Result<__S::Ok, __S::Error> where __S: Serializer,

Auto Trait Implementations§

impl Freeze for PartitionParameters

impl RefUnwindSafe for PartitionParameters

impl Send for PartitionParameters

impl Sync for PartitionParameters

impl Unpin for PartitionParameters

impl UnwindSafe for PartitionParameters

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

impl<T> DeserializeOwned for T where T: for<'de> Deserialize<'de>,

impl<T> ErasedDestructor for T where T: 'static,

Struct PartitionParameters

fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>,

fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<T> DeserializeOwned for T
where T: for<'de> Deserialize<'de>,

impl<T> ErasedDestructor for T
where T: 'static,