1#![warn(missing_docs)]
29#![warn(clippy::all)]
30#![warn(clippy::pedantic)]
31#![allow(clippy::module_name_repetitions)]
32#![allow(clippy::must_use_candidate)]
33#![allow(clippy::doc_markdown)]
34#![allow(clippy::cast_possible_truncation)]
35#![allow(clippy::cast_sign_loss)]
36#![allow(clippy::cast_lossless)]
37#![allow(clippy::cast_possible_wrap)]
38#![allow(clippy::similar_names)]
39#![allow(clippy::missing_fields_in_debug)]
40#![allow(clippy::cast_ptr_alignment)]
41#![allow(clippy::ptr_as_ptr)]
42#![allow(clippy::manual_let_else)]
43#![allow(clippy::match_same_arms)]
44#![allow(clippy::explicit_iter_loop)]
45#![allow(clippy::uninlined_format_args)]
46#![allow(clippy::missing_panics_doc)]
47#![allow(clippy::missing_errors_doc)]
48#![allow(clippy::struct_excessive_bools)]
49#![allow(clippy::items_after_statements)]
50#![allow(clippy::cast_precision_loss)]
51#![allow(clippy::redundant_closure_for_method_calls)]
52#![allow(clippy::format_push_string)]
53#![allow(clippy::derivable_impls)]
54#![allow(clippy::map_unwrap_or)]
55#![allow(clippy::collapsible_if)]
56#![allow(clippy::needless_lifetimes)]
57#![allow(clippy::unused_self)]
58#![allow(clippy::return_self_not_must_use)]
59#![allow(clippy::needless_pass_by_value)]
60
61pub mod bench;
62pub mod debug;
63pub mod dict;
64pub mod error;
65pub mod lattice;
66pub mod normalize;
67pub mod phonetic;
68pub mod semantic;
69pub mod stream;
70pub mod vectors;
71pub mod viterbi;
72
73#[cfg(feature = "wasm")]
74pub mod wasm;
75
76#[cfg(feature = "python")]
77pub mod python;
78
79pub use error::{Error, Result};
80
81use std::fmt;
82use std::path::PathBuf;
83use std::sync::Arc;
84
85use dict::Dictionary;
86use lattice::Lattice;
87use viterbi::ViterbiSolver;
88
/// Output format applied when an [`AnalysisResult`] is rendered via `Display`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OutputFormat {
    /// MeCab-style `surface\tfeature` lines, terminated by an `EOS` line.
    #[default]
    Default,
    /// Space-separated surface forms only (wakati-gaki).
    Wakati,
    /// Verbose dump: index, surface, `pos_id`, `wcost`, and feature per line.
    Dump,
    /// Compact JSON array of `{"surface": ..., "feature": ...}` objects.
    Json,
    /// JSON-LD document with a Wikidata/DBpedia/schema.org `@context` and
    /// per-token entity links.
    Jsonld,
    /// RDF Turtle serialization.
    Turtle,
    /// RDF N-Triples serialization.
    Ntriples,
    /// RDF N-Quads serialization (triples placed in a fixed graph IRI).
    Nquads,
}
110
/// A single morpheme (token) produced by morphological analysis.
#[derive(Debug, Clone)]
pub struct Morpheme {
    /// Surface form exactly as it appeared in the input text.
    pub surface: String,
    /// Dictionary word id; `u32::MAX` is treated as "no entry" when
    /// looking up embeddings (see `MeCrab::get_embedding`).
    pub word_id: u32,
    /// Part-of-speech id from the dictionary.
    pub pos_id: u16,
    /// Word cost assigned by the dictionary.
    pub wcost: i16,
    /// Comma-separated feature string; field 0 is read as the POS,
    /// field 7 as the reading, field 8 as the pronunciation.
    pub feature: String,
    /// Linked knowledge-base entities (empty unless semantic linking is enabled).
    pub entities: Vec<semantic::extension::EntityReference>,
    /// IPA pronunciation; populated only when IPA output is enabled.
    pub pronunciation: Option<String>,
    /// Word embedding vector; populated only when vector output is enabled.
    pub embedding: Option<Vec<f32>>,
}
131
132impl fmt::Display for Morpheme {
133 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
134 write!(f, "{}\t{}", self.surface, self.feature)?;
136
137 if let Some(ref ipa) = self.pronunciation {
139 write!(f, "\n IPA: /{}/", ipa)?;
140 }
141
142 if let Some(ref emb) = self.embedding {
144 write!(f, "\n Vector: [")?;
145 let show_dims = emb.len().min(8);
146 for (i, val) in emb.iter().take(show_dims).enumerate() {
147 if i > 0 {
148 write!(f, ", ")?;
149 }
150 write!(f, "{:.3}", val)?;
151 }
152 if emb.len() > show_dims {
153 write!(f, ", ...")?;
154 }
155 write!(f, "] (dim={})", emb.len())?;
156 }
157
158 Ok(())
159 }
160}
161
/// The result of analyzing a piece of text: the morphemes in input order,
/// plus the output format that its `Display` implementation will use.
#[derive(Debug, Clone)]
pub struct AnalysisResult {
    /// Morphemes of the chosen path, in input order.
    pub morphemes: Vec<Morpheme>,
    /// Rendering format; copied from the analyzer's configured output format.
    format: OutputFormat,
}
170
171impl fmt::Display for AnalysisResult {
172 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
173 match self.format {
174 OutputFormat::Default => {
175 for morpheme in &self.morphemes {
176 writeln!(f, "{morpheme}")?;
177 }
178 writeln!(f, "EOS")
179 }
180 OutputFormat::Wakati => {
181 let surfaces: Vec<&str> =
182 self.morphemes.iter().map(|m| m.surface.as_str()).collect();
183 writeln!(f, "{}", surfaces.join(" "))
184 }
185 OutputFormat::Dump => {
186 for (i, morpheme) in self.morphemes.iter().enumerate() {
187 writeln!(
188 f,
189 "[{}] {} (pos_id={}, wcost={})\t{}",
190 i, morpheme.surface, morpheme.pos_id, morpheme.wcost, morpheme.feature
191 )?;
192 }
193 writeln!(f, "EOS")
194 }
195 OutputFormat::Json => self.format_json(f),
196 OutputFormat::Jsonld => self.format_jsonld(f),
197 OutputFormat::Turtle => self.format_turtle(f),
198 OutputFormat::Ntriples => self.format_ntriples(f),
199 OutputFormat::Nquads => self.format_nquads(f),
200 }
201 }
202}
203
204impl AnalysisResult {
205 fn format_json(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
207 write!(f, "[")?;
208 for (i, m) in self.morphemes.iter().enumerate() {
209 if i > 0 {
210 write!(f, ",")?;
211 }
212 write!(
213 f,
214 "{{\"surface\":\"{}\",\"feature\":\"{}\"}}",
215 semantic::jsonld::escape_json(&m.surface),
216 semantic::jsonld::escape_json(&m.feature)
217 )?;
218 }
219 write!(f, "]")
220 }
221
222 fn format_jsonld(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
224 writeln!(f, "{{")?;
225 writeln!(f, " \"@context\": {{")?;
226 writeln!(f, " \"wd\": \"http://www.wikidata.org/entity/\",")?;
227 writeln!(f, " \"dbr\": \"http://dbpedia.org/resource/\",")?;
228 writeln!(f, " \"schema\": \"http://schema.org/\",")?;
229 writeln!(f, " \"mecrab\": \"http://mecrab.io/ns#\"")?;
230 writeln!(f, " }},")?;
231 writeln!(f, " \"@type\": \"mecrab:Analysis\",")?;
232 writeln!(f, " \"tokens\": [")?;
233
234 for (i, m) in self.morphemes.iter().enumerate() {
235 let features: Vec<&str> = m.feature.split(',').collect();
237 let reading = features.get(7).copied(); writeln!(f, " {{")?;
240 writeln!(
241 f,
242 " \"surface\": \"{}\",",
243 semantic::jsonld::escape_json(&m.surface)
244 )?;
245 writeln!(
246 f,
247 " \"pos\": \"{}\",",
248 features.first().copied().unwrap_or("*")
249 )?;
250 if let Some(r) = reading {
251 if r != "*" {
252 writeln!(f, " \"reading\": \"{}\",", r)?;
253 }
254 }
255
256 if let Some(ref ipa) = m.pronunciation {
258 writeln!(f, " \"pronunciation\": \"/{}/ \",", ipa)?;
259 }
260
261 if let Some(ref embedding) = m.embedding {
263 write!(f, " \"embedding\": [")?;
264 for (j, val) in embedding.iter().enumerate() {
265 if j > 0 {
266 write!(f, ", ")?;
267 }
268 write!(f, "{:.3}", val)?;
269 }
270 writeln!(f, "],")?;
271 }
272
273 let has_entities = !m.entities.is_empty();
275
276 if has_entities {
277 writeln!(f, " \"wcost\": {},", m.wcost)?;
278 writeln!(f, " \"entities\": [")?;
279 for (j, entity) in m.entities.iter().enumerate() {
280 let compact = semantic::compact_uri(&entity.uri);
281 write!(
282 f,
283 " {{\"@id\": \"{}\", \"confidence\": {:.2}}}",
284 compact, entity.confidence
285 )?;
286 if j < m.entities.len() - 1 {
287 writeln!(f, ",")?;
288 } else {
289 writeln!(f)?;
290 }
291 }
292 write!(f, " ]")?;
293 } else {
294 write!(f, " \"wcost\": {}", m.wcost)?;
295 }
296
297 if i < self.morphemes.len() - 1 {
298 writeln!(f, "\n }},")?;
299 } else {
300 writeln!(f, "\n }}")?;
301 }
302 }
303
304 writeln!(f, " ]")?;
305 write!(f, "}}")
306 }
307
308 fn format_turtle(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
310 let tokens: Vec<(String, String, Option<String>, Vec<semantic::SemanticEntry>)> = self
312 .morphemes
313 .iter()
314 .map(|m| {
315 let features: Vec<&str> = m.feature.split(',').collect();
316 let pos = features.first().copied().unwrap_or("*").to_string();
317 let reading = features
318 .get(7)
319 .filter(|&&r| r != "*")
320 .map(|&r| r.to_string());
321
322 let entities: Vec<semantic::SemanticEntry> = m
324 .entities
325 .iter()
326 .map(|e| {
327 semantic::SemanticEntry::new(
328 &e.uri,
329 e.confidence,
330 semantic::OntologySource::Wikidata,
331 )
332 })
333 .collect();
334
335 (m.surface.clone(), pos, reading, entities)
336 })
337 .collect();
338
339 let turtle = semantic::rdf::export_turtle(&tokens, "http://example.org/analysis");
340 write!(f, "{}", turtle)
341 }
342
343 fn format_ntriples(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
345 let tokens: Vec<(String, String, Option<String>, Vec<semantic::SemanticEntry>)> = self
347 .morphemes
348 .iter()
349 .map(|m| {
350 let features: Vec<&str> = m.feature.split(',').collect();
351 let pos = features.first().copied().unwrap_or("*").to_string();
352 let reading = features
353 .get(7)
354 .filter(|&&r| r != "*")
355 .map(|&r| r.to_string());
356
357 let entities: Vec<semantic::SemanticEntry> = m
358 .entities
359 .iter()
360 .map(|e| {
361 semantic::SemanticEntry::new(
362 &e.uri,
363 e.confidence,
364 semantic::OntologySource::Wikidata,
365 )
366 })
367 .collect();
368
369 (m.surface.clone(), pos, reading, entities)
370 })
371 .collect();
372
373 let ntriples = semantic::rdf::export_ntriples(&tokens, "http://example.org/analysis");
374 write!(f, "{}", ntriples)
375 }
376
377 fn format_nquads(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
379 let tokens: Vec<(String, String, Option<String>, Vec<semantic::SemanticEntry>)> = self
381 .morphemes
382 .iter()
383 .map(|m| {
384 let features: Vec<&str> = m.feature.split(',').collect();
385 let pos = features.first().copied().unwrap_or("*").to_string();
386 let reading = features
387 .get(7)
388 .filter(|&&r| r != "*")
389 .map(|&r| r.to_string());
390
391 let entities: Vec<semantic::SemanticEntry> = m
392 .entities
393 .iter()
394 .map(|e| {
395 semantic::SemanticEntry::new(
396 &e.uri,
397 e.confidence,
398 semantic::OntologySource::Wikidata,
399 )
400 })
401 .collect();
402
403 (m.surface.clone(), pos, reading, entities)
404 })
405 .collect();
406
407 let nquads = semantic::rdf::export_nquads(
408 &tokens,
409 "http://example.org/analysis",
410 "http://example.org/graph",
411 );
412 write!(f, "{}", nquads)
413 }
414}
415
/// Builder for configuring and constructing a [`MeCrab`] analyzer.
#[derive(Debug, Default)]
pub struct MeCrabBuilder {
    /// System dictionary directory; `None` means the built-in default dictionary.
    dicdir: Option<PathBuf>,
    /// User dictionary path.
    /// NOTE(review): stored via `userdic()` but never read in `build()` —
    /// confirm whether user-dictionary loading is implemented or pending.
    userdic: Option<PathBuf>,
    /// Explicit path to a prebuilt semantic pool (`semantic.bin`).
    semantic_pool: Option<PathBuf>,
    /// Path to a word-vector store for embedding lookups.
    vector_pool: Option<PathBuf>,
    /// Enable knowledge-base entity linking on parsed morphemes.
    with_semantic: bool,
    /// Enable IPA pronunciation output on parsed morphemes.
    with_ipa: bool,
    /// Enable embedding lookup on parsed morphemes.
    with_vector: bool,
    /// Output format used by `Display` on analysis results.
    output_format: OutputFormat,
}
428
429impl MeCrabBuilder {
430 #[must_use]
432 pub fn new() -> Self {
433 Self::default()
434 }
435
436 #[must_use]
438 pub fn dicdir(mut self, path: Option<PathBuf>) -> Self {
439 self.dicdir = path;
440 self
441 }
442
443 #[must_use]
445 pub fn userdic(mut self, path: Option<PathBuf>) -> Self {
446 self.userdic = path;
447 self
448 }
449
450 #[must_use]
452 pub fn semantic_pool(mut self, path: Option<PathBuf>) -> Self {
453 self.semantic_pool = path;
454 self
455 }
456
457 #[must_use]
459 pub fn with_semantic(mut self, enabled: bool) -> Self {
460 self.with_semantic = enabled;
461 self
462 }
463
464 #[must_use]
466 pub fn with_ipa(mut self, enabled: bool) -> Self {
467 self.with_ipa = enabled;
468 self
469 }
470
471 #[must_use]
473 pub fn vector_pool(mut self, path: Option<PathBuf>) -> Self {
474 self.vector_pool = path;
475 self
476 }
477
478 #[must_use]
480 pub fn with_vector(mut self, enabled: bool) -> Self {
481 self.with_vector = enabled;
482 self
483 }
484
485 #[must_use]
487 pub fn output_format(mut self, format: OutputFormat) -> Self {
488 self.output_format = format;
489 self
490 }
491
492 pub fn build(self) -> Result<MeCrab> {
498 let dictionary = match (self.dicdir, self.semantic_pool) {
499 (Some(dicdir), Some(semantic_path)) => {
500 Dictionary::load_with_semantics(&dicdir, &semantic_path)?
501 }
502 (Some(dicdir), None) => {
503 let semantic_path = dicdir.join("semantic.bin");
505 if semantic_path.exists() {
506 Dictionary::load_with_semantics(&dicdir, &semantic_path)?
507 } else {
508 Dictionary::load(&dicdir)?
509 }
510 }
511 (None, Some(semantic_path)) => {
512 let dict = Dictionary::default_dictionary()?;
513 let pool_file = std::fs::File::open(&semantic_path)?;
514 let pool_data = unsafe { memmap2::Mmap::map(&pool_file)? };
515 let pool = crate::semantic::pool::SemanticPool::from_bytes(&pool_data)?;
516 let mut dict_mut = dict;
517 dict_mut.semantic_pool = Some(Arc::new(pool));
518 dict_mut
519 }
520 (None, None) => {
521 Dictionary::default_dictionary()?
523 }
524 };
525
526 let vector_store = if let Some(vector_path) = self.vector_pool {
528 Some(Arc::new(vectors::VectorStore::from_file(&vector_path)?))
529 } else {
530 None
531 };
532
533 Ok(MeCrab {
534 dictionary: Arc::new(dictionary),
535 output_format: self.output_format,
536 semantic_enabled: self.with_semantic,
537 ipa_enabled: self.with_ipa,
538 vector_enabled: self.with_vector,
539 vector_store,
540 })
541 }
542}
543
/// Morphological analyzer handle.
///
/// Cloning is cheap: the dictionary and vector store are shared via `Arc`.
#[derive(Clone)]
pub struct MeCrab {
    /// Shared dictionary used for lattice construction and Viterbi search.
    dictionary: Arc<Dictionary>,
    /// Format applied when `Display`ing analysis results.
    output_format: OutputFormat,
    /// Attach knowledge-base entity links to parsed morphemes.
    semantic_enabled: bool,
    /// Attach IPA pronunciations to parsed morphemes.
    ipa_enabled: bool,
    /// Attach embedding vectors to parsed morphemes.
    vector_enabled: bool,
    /// Word-vector store backing embedding lookups; `None` if not loaded.
    vector_store: Option<Arc<vectors::VectorStore>>,
}
554
555impl MeCrab {
556 pub fn new() -> Result<Self> {
562 Self::builder().build()
563 }
564
565 #[must_use]
567 pub fn builder() -> MeCrabBuilder {
568 MeCrabBuilder::new()
569 }
570
571 pub fn parse(&self, text: &str) -> Result<AnalysisResult> {
577 let lattice = Lattice::build(text, &self.dictionary)?;
579
580 let solver = ViterbiSolver::new(&self.dictionary);
582 let path = solver.solve(&lattice)?;
583
584 let morphemes = path
586 .into_iter()
587 .map(|node| {
588 let entities = if self.semantic_enabled {
589 self.get_entities_for_surface(&node.surface)
590 } else {
591 Vec::new()
592 };
593
594 let pronunciation = if self.ipa_enabled {
595 self.get_ipa_pronunciation(&node.feature)
596 } else {
597 None
598 };
599
600 let embedding = if self.vector_enabled {
601 self.get_embedding(node.word_id)
602 } else {
603 None
604 };
605
606 Morpheme {
607 surface: node.surface,
608 word_id: node.word_id,
609 pos_id: node.pos_id,
610 wcost: node.wcost,
611 feature: node.feature,
612 entities,
613 pronunciation,
614 embedding,
615 }
616 })
617 .collect();
618
619 Ok(AnalysisResult {
620 morphemes,
621 format: self.output_format,
622 })
623 }
624
625 fn get_entities_for_surface(&self, surface: &str) -> Vec<semantic::EntityReference> {
627 if let Some(ref surface_map) = self.dictionary.surface_map {
628 if let Some(uris) = surface_map.get(surface) {
629 return uris
630 .iter()
631 .map(|(uri, confidence)| {
632 let source = if uri.contains("wikidata.org") {
633 semantic::OntologySource::Wikidata
634 } else if uri.contains("dbpedia.org") {
635 semantic::OntologySource::DBpedia
636 } else {
637 semantic::OntologySource::Custom
638 };
639 semantic::EntityReference::new(uri.clone(), *confidence, source)
640 })
641 .collect();
642 }
643 }
644 Vec::new()
645 }
646
647 fn get_ipa_pronunciation(&self, feature: &str) -> Option<String> {
649 let fields: Vec<&str> = feature.split(',').collect();
651
652 let pos = fields.first().copied().unwrap_or("");
654
655 if let Some(&pron) = fields.get(8) {
658 if pron != "*" && !pron.is_empty() {
659 return Some(phonetic::to_ipa(pron));
660 }
661 }
662
663 if let Some(&reading) = fields.get(7) {
665 if reading != "*" && !reading.is_empty() {
666 if pos == "助詞" {
668 let ipa = match reading {
669 "ハ" => "wa", "ヘ" => "e", "ヲ" => "o", _ => return Some(phonetic::to_ipa(reading)),
673 };
674 return Some(ipa.to_string());
675 }
676 return Some(phonetic::to_ipa(reading));
677 }
678 }
679
680 None
681 }
682
683 fn get_embedding(&self, word_id: u32) -> Option<Vec<f32>> {
690 if word_id == u32::MAX {
692 return None;
693 }
694
695 self.vector_store
696 .as_ref()
697 .and_then(|store| store.get(word_id))
698 .map(|slice| slice.to_vec())
699 }
700
701 pub fn wakati(&self, text: &str) -> Result<String> {
707 let result = self.parse(text)?;
708 let surfaces: Vec<&str> = result
709 .morphemes
710 .iter()
711 .map(|m| m.surface.as_str())
712 .collect();
713 Ok(surfaces.join(" "))
714 }
715
716 #[cfg(feature = "parallel")]
725 pub fn parse_batch(&self, texts: &[&str]) -> Vec<Result<AnalysisResult>> {
726 use rayon::prelude::*;
727 texts.par_iter().map(|text| self.parse(text)).collect()
728 }
729
730 #[cfg(not(feature = "parallel"))]
732 pub fn parse_batch(&self, texts: &[&str]) -> Vec<Result<AnalysisResult>> {
733 texts.iter().map(|text| self.parse(text)).collect()
734 }
735
736 #[cfg(feature = "parallel")]
742 pub fn wakati_batch(&self, texts: &[&str]) -> Vec<Result<String>> {
743 use rayon::prelude::*;
744 texts.par_iter().map(|text| self.wakati(text)).collect()
745 }
746
747 #[cfg(not(feature = "parallel"))]
749 pub fn wakati_batch(&self, texts: &[&str]) -> Vec<Result<String>> {
750 texts.iter().map(|text| self.wakati(text)).collect()
751 }
752
753 pub fn add_word(&self, surface: &str, reading: &str, pronunciation: &str, wcost: i16) {
777 self.dictionary
778 .add_simple_word(surface, reading, pronunciation, wcost);
779 }
780
781 pub fn remove_word(&self, surface: &str) -> bool {
786 self.dictionary.remove_word(surface)
787 }
788
789 pub fn overlay_size(&self) -> usize {
791 self.dictionary.overlay_size()
792 }
793
794 pub fn parse_nbest(&self, text: &str, n: usize) -> Result<Vec<(AnalysisResult, i64)>> {
808 let lattice = Lattice::build(text, &self.dictionary)?;
810
811 let solver = ViterbiSolver::new(&self.dictionary);
813 let paths = solver.solve_nbest(&lattice, n)?;
814
815 let results = paths
817 .into_iter()
818 .map(|(path, cost)| {
819 let morphemes = path
820 .into_iter()
821 .map(|node| {
822 let entities = if self.semantic_enabled {
823 self.get_entities_for_surface(&node.surface)
824 } else {
825 Vec::new()
826 };
827
828 let pronunciation = if self.ipa_enabled {
829 self.get_ipa_pronunciation(&node.feature)
830 } else {
831 None
832 };
833
834 let embedding = if self.vector_enabled {
835 self.get_embedding(node.word_id)
836 } else {
837 None
838 };
839
840 Morpheme {
841 surface: node.surface,
842 word_id: node.word_id,
843 pos_id: node.pos_id,
844 wcost: node.wcost,
845 feature: node.feature,
846 entities,
847 pronunciation,
848 embedding,
849 }
850 })
851 .collect();
852
853 (
854 AnalysisResult {
855 morphemes,
856 format: self.output_format,
857 },
858 cost,
859 )
860 })
861 .collect();
862
863 Ok(results)
864 }
865}
866
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed builder has no paths set and uses the
    /// default output format.
    #[test]
    fn test_builder_default() {
        let builder = MeCrabBuilder::new();
        assert!(builder.dicdir.is_none());
        assert!(builder.userdic.is_none());
        assert_eq!(builder.output_format, OutputFormat::Default);
    }
}