1#![warn(missing_docs)]
29#![warn(clippy::all)]
30#![warn(clippy::pedantic)]
31#![allow(clippy::module_name_repetitions)]
32#![allow(clippy::must_use_candidate)]
33#![allow(clippy::doc_markdown)]
34#![allow(clippy::cast_possible_truncation)]
35#![allow(clippy::cast_sign_loss)]
36#![allow(clippy::cast_lossless)]
37#![allow(clippy::cast_possible_wrap)]
38#![allow(clippy::similar_names)]
39#![allow(clippy::missing_fields_in_debug)]
40#![allow(clippy::cast_ptr_alignment)]
41#![allow(clippy::ptr_as_ptr)]
42#![allow(clippy::manual_let_else)]
43#![allow(clippy::match_same_arms)]
44#![allow(clippy::explicit_iter_loop)]
45#![allow(clippy::uninlined_format_args)]
46#![allow(clippy::missing_panics_doc)]
47#![allow(clippy::missing_errors_doc)]
48#![allow(clippy::struct_excessive_bools)]
49#![allow(clippy::items_after_statements)]
50#![allow(clippy::cast_precision_loss)]
51#![allow(clippy::redundant_closure_for_method_calls)]
52#![allow(clippy::format_push_string)]
53#![allow(clippy::derivable_impls)]
54#![allow(clippy::map_unwrap_or)]
55#![allow(clippy::collapsible_if)]
56#![allow(clippy::needless_lifetimes)]
57#![allow(clippy::unused_self)]
58#![allow(clippy::return_self_not_must_use)]
59#![allow(clippy::needless_pass_by_value)]
60
61pub mod bench;
62pub mod debug;
63pub mod dict;
64pub mod error;
65pub mod lattice;
66pub mod normalize;
67pub mod phonetic;
68pub mod semantic;
69pub mod stream;
70pub mod vectors;
71pub mod viterbi;
72
73#[cfg(feature = "wasm")]
74pub mod wasm;
75
76#[cfg(feature = "python")]
77pub mod python;
78
79pub use error::{Error, Result};
80
81use std::fmt;
82use std::path::PathBuf;
83use std::sync::Arc;
84
85use dict::Dictionary;
86use lattice::Lattice;
87use viterbi::ViterbiSolver;
88
/// Output format applied when an [`AnalysisResult`] is rendered via `Display`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OutputFormat {
    /// MeCab-style `surface\tfeature` lines, terminated by an `EOS` line.
    #[default]
    Default,
    /// Space-separated surface forms only (wakati-gaki).
    Wakati,
    /// Verbose dump: index, surface, `pos_id`, `wcost`, and feature per line.
    Dump,
    /// Compact JSON array of `{"surface": ..., "feature": ...}` objects.
    Json,
    /// JSON-LD document with a Wikidata/DBpedia/schema.org `@context` and
    /// per-token entity links.
    Jsonld,
    /// RDF Turtle serialization.
    Turtle,
    /// RDF N-Triples serialization.
    Ntriples,
    /// RDF N-Quads serialization (triples placed in a fixed graph IRI).
    Nquads,
}
110
/// A single morpheme (token) produced by morphological analysis.
#[derive(Debug, Clone)]
pub struct Morpheme {
    /// Surface form exactly as it appeared in the input text.
    pub surface: String,
    /// Dictionary word id; `u32::MAX` is treated as "no entry" when
    /// looking up embeddings (see `MeCrab::get_embedding`).
    pub word_id: u32,
    /// Part-of-speech id from the dictionary.
    pub pos_id: u16,
    /// Word cost assigned by the dictionary.
    pub wcost: i16,
    /// Comma-separated feature string; field 0 is read as the POS,
    /// field 7 as the reading, field 8 as the pronunciation.
    pub feature: String,
    /// Linked knowledge-base entities (empty unless semantic linking is enabled).
    pub entities: Vec<semantic::extension::EntityReference>,
    /// IPA pronunciation; populated only when IPA output is enabled.
    pub pronunciation: Option<String>,
    /// Word embedding vector; populated only when vector output is enabled.
    pub embedding: Option<Vec<f32>>,
}
131
132impl fmt::Display for Morpheme {
133 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
134 write!(f, "{}\t{}", self.surface, self.feature)?;
136
137 if let Some(ref ipa) = self.pronunciation {
139 write!(f, "\n IPA: /{}/", ipa)?;
140 }
141
142 if let Some(ref emb) = self.embedding {
144 write!(f, "\n Vector: [")?;
145 let show_dims = emb.len().min(8);
146 for (i, val) in emb.iter().take(show_dims).enumerate() {
147 if i > 0 {
148 write!(f, ", ")?;
149 }
150 write!(f, "{:.3}", val)?;
151 }
152 if emb.len() > show_dims {
153 write!(f, ", ...")?;
154 }
155 write!(f, "] (dim={})", emb.len())?;
156 }
157
158 Ok(())
159 }
160}
161
/// The result of analyzing a piece of text: the morphemes in input order,
/// plus the output format that its `Display` implementation will use.
#[derive(Debug, Clone)]
pub struct AnalysisResult {
    /// Morphemes of the chosen path, in input order.
    pub morphemes: Vec<Morpheme>,
    /// Rendering format; copied from the analyzer's configured output format.
    format: OutputFormat,
}
170
171impl fmt::Display for AnalysisResult {
172 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
173 match self.format {
174 OutputFormat::Default => {
175 for morpheme in &self.morphemes {
176 writeln!(f, "{morpheme}")?;
177 }
178 writeln!(f, "EOS")
179 }
180 OutputFormat::Wakati => {
181 let surfaces: Vec<&str> =
182 self.morphemes.iter().map(|m| m.surface.as_str()).collect();
183 writeln!(f, "{}", surfaces.join(" "))
184 }
185 OutputFormat::Dump => {
186 for (i, morpheme) in self.morphemes.iter().enumerate() {
187 writeln!(
188 f,
189 "[{}] {} (pos_id={}, wcost={})\t{}",
190 i, morpheme.surface, morpheme.pos_id, morpheme.wcost, morpheme.feature
191 )?;
192 }
193 writeln!(f, "EOS")
194 }
195 OutputFormat::Json => self.format_json(f),
196 OutputFormat::Jsonld => self.format_jsonld(f),
197 OutputFormat::Turtle => self.format_turtle(f),
198 OutputFormat::Ntriples => self.format_ntriples(f),
199 OutputFormat::Nquads => self.format_nquads(f),
200 }
201 }
202}
203
204impl AnalysisResult {
205 fn format_json(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
207 write!(f, "[")?;
208 for (i, m) in self.morphemes.iter().enumerate() {
209 if i > 0 {
210 write!(f, ",")?;
211 }
212 write!(
213 f,
214 "{{\"surface\":\"{}\",\"feature\":\"{}\"}}",
215 semantic::jsonld::escape_json(&m.surface),
216 semantic::jsonld::escape_json(&m.feature)
217 )?;
218 }
219 write!(f, "]")
220 }
221
222 fn format_jsonld(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
224 writeln!(f, "{{")?;
225 writeln!(f, " \"@context\": {{")?;
226 writeln!(f, " \"wd\": \"http://www.wikidata.org/entity/\",")?;
227 writeln!(f, " \"dbr\": \"http://dbpedia.org/resource/\",")?;
228 writeln!(f, " \"schema\": \"http://schema.org/\",")?;
229 writeln!(f, " \"mecrab\": \"http://mecrab.io/ns#\"")?;
230 writeln!(f, " }},")?;
231 writeln!(f, " \"@type\": \"mecrab:Analysis\",")?;
232 writeln!(f, " \"tokens\": [")?;
233
234 for (i, m) in self.morphemes.iter().enumerate() {
235 let features: Vec<&str> = m.feature.split(',').collect();
237 let reading = features.get(7).copied(); writeln!(f, " {{")?;
240 writeln!(
241 f,
242 " \"surface\": \"{}\",",
243 semantic::jsonld::escape_json(&m.surface)
244 )?;
245 writeln!(
246 f,
247 " \"pos\": \"{}\",",
248 features.first().copied().unwrap_or("*")
249 )?;
250 if let Some(r) = reading {
251 if r != "*" {
252 writeln!(f, " \"reading\": \"{}\",", r)?;
253 }
254 }
255
256 if let Some(ref ipa) = m.pronunciation {
258 writeln!(f, " \"pronunciation\": \"/{}/ \",", ipa)?;
259 }
260
261 if let Some(ref embedding) = m.embedding {
263 write!(f, " \"embedding\": [")?;
264 for (j, val) in embedding.iter().enumerate() {
265 if j > 0 {
266 write!(f, ", ")?;
267 }
268 write!(f, "{:.3}", val)?;
269 }
270 writeln!(f, "],")?;
271 }
272
273 let has_entities = !m.entities.is_empty();
275
276 if has_entities {
277 writeln!(f, " \"wcost\": {},", m.wcost)?;
278 writeln!(f, " \"entities\": [")?;
279 for (j, entity) in m.entities.iter().enumerate() {
280 let compact = semantic::compact_uri(&entity.uri);
281 write!(
282 f,
283 " {{\"@id\": \"{}\", \"confidence\": {:.2}}}",
284 compact, entity.confidence
285 )?;
286 if j < m.entities.len() - 1 {
287 writeln!(f, ",")?;
288 } else {
289 writeln!(f)?;
290 }
291 }
292 write!(f, " ]")?;
293 } else {
294 write!(f, " \"wcost\": {}", m.wcost)?;
295 }
296
297 if i < self.morphemes.len() - 1 {
298 writeln!(f, "\n }},")?;
299 } else {
300 writeln!(f, "\n }}")?;
301 }
302 }
303
304 writeln!(f, " ]")?;
305 write!(f, "}}")
306 }
307
308 fn format_turtle(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
310 let tokens: Vec<(String, String, Option<String>, Vec<semantic::SemanticEntry>)> = self
312 .morphemes
313 .iter()
314 .map(|m| {
315 let features: Vec<&str> = m.feature.split(',').collect();
316 let pos = features.first().copied().unwrap_or("*").to_string();
317 let reading = features
318 .get(7)
319 .filter(|&&r| r != "*")
320 .map(|&r| r.to_string());
321
322 let entities: Vec<semantic::SemanticEntry> = m
324 .entities
325 .iter()
326 .map(|e| {
327 semantic::SemanticEntry::new(
328 &e.uri,
329 e.confidence,
330 semantic::OntologySource::Wikidata,
331 )
332 })
333 .collect();
334
335 (m.surface.clone(), pos, reading, entities)
336 })
337 .collect();
338
339 let turtle = semantic::rdf::export_turtle(&tokens, "http://example.org/analysis");
340 write!(f, "{}", turtle)
341 }
342
343 fn format_ntriples(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
345 let tokens: Vec<(String, String, Option<String>, Vec<semantic::SemanticEntry>)> = self
347 .morphemes
348 .iter()
349 .map(|m| {
350 let features: Vec<&str> = m.feature.split(',').collect();
351 let pos = features.first().copied().unwrap_or("*").to_string();
352 let reading = features
353 .get(7)
354 .filter(|&&r| r != "*")
355 .map(|&r| r.to_string());
356
357 let entities: Vec<semantic::SemanticEntry> = m
358 .entities
359 .iter()
360 .map(|e| {
361 semantic::SemanticEntry::new(
362 &e.uri,
363 e.confidence,
364 semantic::OntologySource::Wikidata,
365 )
366 })
367 .collect();
368
369 (m.surface.clone(), pos, reading, entities)
370 })
371 .collect();
372
373 let ntriples = semantic::rdf::export_ntriples(&tokens, "http://example.org/analysis");
374 write!(f, "{}", ntriples)
375 }
376
377 fn format_nquads(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
379 let tokens: Vec<(String, String, Option<String>, Vec<semantic::SemanticEntry>)> = self
381 .morphemes
382 .iter()
383 .map(|m| {
384 let features: Vec<&str> = m.feature.split(',').collect();
385 let pos = features.first().copied().unwrap_or("*").to_string();
386 let reading = features
387 .get(7)
388 .filter(|&&r| r != "*")
389 .map(|&r| r.to_string());
390
391 let entities: Vec<semantic::SemanticEntry> = m
392 .entities
393 .iter()
394 .map(|e| {
395 semantic::SemanticEntry::new(
396 &e.uri,
397 e.confidence,
398 semantic::OntologySource::Wikidata,
399 )
400 })
401 .collect();
402
403 (m.surface.clone(), pos, reading, entities)
404 })
405 .collect();
406
407 let nquads = semantic::rdf::export_nquads(
408 &tokens,
409 "http://example.org/analysis",
410 "http://example.org/graph",
411 );
412 write!(f, "{}", nquads)
413 }
414}
415
/// Builder for configuring and constructing a [`MeCrab`] analyzer.
#[derive(Debug, Default)]
pub struct MeCrabBuilder {
    /// System dictionary directory; `None` means the built-in default dictionary.
    dicdir: Option<PathBuf>,
    /// User dictionary path.
    /// NOTE(review): stored via `userdic()` but never read in `build()` —
    /// confirm whether user-dictionary loading is implemented or pending.
    userdic: Option<PathBuf>,
    /// Explicit path to a prebuilt semantic pool (`semantic.bin`).
    semantic_pool: Option<PathBuf>,
    /// Path to a word-vector store for embedding lookups.
    vector_pool: Option<PathBuf>,
    /// Enable knowledge-base entity linking on parsed morphemes.
    with_semantic: bool,
    /// Enable IPA pronunciation output on parsed morphemes.
    with_ipa: bool,
    /// Enable embedding lookup on parsed morphemes.
    with_vector: bool,
    /// Output format used by `Display` on analysis results.
    output_format: OutputFormat,
}
428
429impl MeCrabBuilder {
430 #[must_use]
432 pub fn new() -> Self {
433 Self::default()
434 }
435
436 #[must_use]
438 pub fn dicdir(mut self, path: Option<PathBuf>) -> Self {
439 self.dicdir = path;
440 self
441 }
442
443 #[must_use]
445 pub fn userdic(mut self, path: Option<PathBuf>) -> Self {
446 self.userdic = path;
447 self
448 }
449
450 #[must_use]
452 pub fn semantic_pool(mut self, path: Option<PathBuf>) -> Self {
453 self.semantic_pool = path;
454 self
455 }
456
457 #[must_use]
459 pub fn with_semantic(mut self, enabled: bool) -> Self {
460 self.with_semantic = enabled;
461 self
462 }
463
464 #[must_use]
466 pub fn with_ipa(mut self, enabled: bool) -> Self {
467 self.with_ipa = enabled;
468 self
469 }
470
471 #[must_use]
473 pub fn vector_pool(mut self, path: Option<PathBuf>) -> Self {
474 self.vector_pool = path;
475 self
476 }
477
478 #[must_use]
480 pub fn with_vector(mut self, enabled: bool) -> Self {
481 self.with_vector = enabled;
482 self
483 }
484
485 #[must_use]
487 pub fn output_format(mut self, format: OutputFormat) -> Self {
488 self.output_format = format;
489 self
490 }
491
492 pub fn build(self) -> Result<MeCrab> {
498 let dictionary = match (self.dicdir, self.semantic_pool) {
499 (Some(dicdir), Some(semantic_path)) => {
500 Dictionary::load_with_semantics(&dicdir, &semantic_path)?
501 }
502 (Some(dicdir), None) => {
503 let semantic_path = dicdir.join("semantic.bin");
505 if semantic_path.exists() {
506 Dictionary::load_with_semantics(&dicdir, &semantic_path)?
507 } else {
508 Dictionary::load(&dicdir)?
509 }
510 }
511 (None, Some(semantic_path)) => {
512 let dict = Dictionary::default_dictionary()?;
513 let pool_file = std::fs::File::open(&semantic_path)?;
514 let pool_data = unsafe { memmap2::Mmap::map(&pool_file)? };
515 let pool = crate::semantic::pool::SemanticPool::from_bytes(&pool_data)?;
516 let mut dict_mut = dict;
517 dict_mut.semantic_pool = Some(Arc::new(pool));
518 dict_mut
519 }
520 (None, None) => {
521 Dictionary::default_dictionary()?
523 }
524 };
525
526 let vector_store = if let Some(vector_path) = self.vector_pool {
528 Some(Arc::new(vectors::VectorStore::from_file(&vector_path)?))
529 } else {
530 None
531 };
532
533 Ok(MeCrab {
534 dictionary: Arc::new(dictionary),
535 output_format: self.output_format,
536 semantic_enabled: self.with_semantic,
537 ipa_enabled: self.with_ipa,
538 vector_enabled: self.with_vector,
539 vector_store,
540 })
541 }
542}
543
/// Morphological analyzer handle.
///
/// Cloning is cheap: the dictionary and vector store are shared via `Arc`.
#[derive(Clone)]
pub struct MeCrab {
    /// Shared dictionary used for lattice construction and Viterbi search.
    dictionary: Arc<Dictionary>,
    /// Format applied when `Display`ing analysis results.
    output_format: OutputFormat,
    /// Attach knowledge-base entity links to parsed morphemes.
    semantic_enabled: bool,
    /// Attach IPA pronunciations to parsed morphemes.
    ipa_enabled: bool,
    /// Attach embedding vectors to parsed morphemes.
    vector_enabled: bool,
    /// Word-vector store backing embedding lookups; `None` if not loaded.
    vector_store: Option<Arc<vectors::VectorStore>>,
}
554
555impl MeCrab {
556 pub fn new() -> Result<Self> {
562 Self::builder().build()
563 }
564
565 #[must_use]
567 pub fn builder() -> MeCrabBuilder {
568 MeCrabBuilder::new()
569 }
570
571 pub fn parse(&self, text: &str) -> Result<AnalysisResult> {
577 let lattice = Lattice::build(text, &self.dictionary)?;
579
580 let solver = ViterbiSolver::new(&self.dictionary);
582 let path = solver.solve(&lattice)?;
583
584 let morphemes = path
586 .into_iter()
587 .map(|node| {
588 let entities = if self.semantic_enabled {
589 self.get_entities_for_surface(&node.surface)
590 } else {
591 Vec::new()
592 };
593
594 let pronunciation = if self.ipa_enabled {
595 self.get_ipa_pronunciation(&node.feature)
596 } else {
597 None
598 };
599
600 let embedding = if self.vector_enabled {
601 self.get_embedding(node.word_id)
602 } else {
603 None
604 };
605
606 Morpheme {
607 surface: node.surface,
608 word_id: node.word_id,
609 pos_id: node.pos_id,
610 wcost: node.wcost,
611 feature: node.feature,
612 entities,
613 pronunciation,
614 embedding,
615 }
616 })
617 .collect();
618
619 Ok(AnalysisResult {
620 morphemes,
621 format: self.output_format,
622 })
623 }
624
625 fn get_entities_for_surface(&self, surface: &str) -> Vec<semantic::EntityReference> {
627 if let Some(ref surface_map) = self.dictionary.surface_map {
628 if let Some(uris) = surface_map.get(surface) {
629 return uris
630 .iter()
631 .map(|(uri, confidence)| {
632 let source = if uri.contains("wikidata.org") {
633 semantic::OntologySource::Wikidata
634 } else if uri.contains("dbpedia.org") {
635 semantic::OntologySource::DBpedia
636 } else {
637 semantic::OntologySource::Custom
638 };
639 semantic::EntityReference::new(uri.clone(), *confidence, source)
640 })
641 .collect();
642 }
643 }
644 Vec::new()
645 }
646
647 fn get_ipa_pronunciation(&self, feature: &str) -> Option<String> {
649 let fields: Vec<&str> = feature.split(',').collect();
651
652 let pos = fields.first().copied().unwrap_or("");
654
655 if let Some(&pron) = fields.get(8) {
658 if pron != "*" && !pron.is_empty() {
659 return Some(phonetic::to_ipa(pron));
660 }
661 }
662
663 if let Some(&reading) = fields.get(7) {
665 if reading != "*" && !reading.is_empty() {
666 if pos == "助詞" {
668 let ipa = match reading {
669 "ハ" => "wa", "ヘ" => "e", "ヲ" => "o", _ => return Some(phonetic::to_ipa(reading)),
673 };
674 return Some(ipa.to_string());
675 }
676 return Some(phonetic::to_ipa(reading));
677 }
678 }
679
680 None
681 }
682
683 fn get_embedding(&self, word_id: u32) -> Option<Vec<f32>> {
690 if word_id == u32::MAX {
692 return None;
693 }
694
695 self.vector_store
696 .as_ref()
697 .and_then(|store| store.get(word_id))
698 .map(|slice| slice.to_vec())
699 }
700
701 pub fn wakati(&self, text: &str) -> Result<String> {
707 let result = self.parse(text)?;
708 let surfaces: Vec<&str> = result
709 .morphemes
710 .iter()
711 .map(|m| m.surface.as_str())
712 .collect();
713 Ok(surfaces.join(" "))
714 }
715
716 #[cfg(feature = "parallel")]
725 pub fn parse_batch(&self, texts: &[&str]) -> Vec<Result<AnalysisResult>> {
726 use rayon::prelude::*;
727 texts.par_iter().map(|text| self.parse(text)).collect()
728 }
729
730 #[cfg(not(feature = "parallel"))]
732 pub fn parse_batch(&self, texts: &[&str]) -> Vec<Result<AnalysisResult>> {
733 texts.iter().map(|text| self.parse(text)).collect()
734 }
735
736 #[cfg(feature = "parallel")]
742 pub fn wakati_batch(&self, texts: &[&str]) -> Vec<Result<String>> {
743 use rayon::prelude::*;
744 texts.par_iter().map(|text| self.wakati(text)).collect()
745 }
746
747 #[cfg(not(feature = "parallel"))]
749 pub fn wakati_batch(&self, texts: &[&str]) -> Vec<Result<String>> {
750 texts.iter().map(|text| self.wakati(text)).collect()
751 }
752
753 pub fn add_word(&self, surface: &str, reading: &str, pronunciation: &str, wcost: i16) {
777 self.dictionary
778 .add_simple_word(surface, reading, pronunciation, wcost);
779 }
780
781 pub fn remove_word(&self, surface: &str) -> bool {
786 self.dictionary.remove_word(surface)
787 }
788
789 pub fn overlay_size(&self) -> usize {
791 self.dictionary.overlay_size()
792 }
793
794 pub fn parse_nbest(&self, text: &str, n: usize) -> Result<Vec<(AnalysisResult, i64)>> {
808 let lattice = Lattice::build(text, &self.dictionary)?;
810
811 let solver = ViterbiSolver::new(&self.dictionary);
813 let paths = solver.solve_nbest(&lattice, n)?;
814
815 let results = paths
817 .into_iter()
818 .map(|(path, cost)| {
819 let morphemes = path
820 .into_iter()
821 .map(|node| {
822 let entities = if self.semantic_enabled {
823 self.get_entities_for_surface(&node.surface)
824 } else {
825 Vec::new()
826 };
827
828 let pronunciation = if self.ipa_enabled {
829 self.get_ipa_pronunciation(&node.feature)
830 } else {
831 None
832 };
833
834 let embedding = if self.vector_enabled {
835 self.get_embedding(node.word_id)
836 } else {
837 None
838 };
839
840 Morpheme {
841 surface: node.surface,
842 word_id: node.word_id,
843 pos_id: node.pos_id,
844 wcost: node.wcost,
845 feature: node.feature,
846 entities,
847 pronunciation,
848 embedding,
849 }
850 })
851 .collect();
852
853 (
854 AnalysisResult {
855 morphemes,
856 format: self.output_format,
857 },
858 cost,
859 )
860 })
861 .collect();
862
863 Ok(results)
864 }
865}
866
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed builder has no paths set and uses the
    /// default output format.
    #[test]
    fn test_builder_default() {
        let builder = MeCrabBuilder::new();
        assert!(builder.dicdir.is_none());
        assert!(builder.userdic.is_none());
        assert_eq!(builder.output_format, OutputFormat::Default);
    }
}