1use crate::Result;
14use regex::Regex;
15use std::collections::HashMap;
16
/// Part-of-speech categories assigned to tokens.
///
/// Every variant has a tag string, available through [`POSTag::penn_tag`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum POSTag {
    /// Common noun, singular (`NN`).
    Noun,
    /// Common noun, plural (`NNS`).
    NounPlural,
    /// Proper noun, singular (`NNP`).
    ProperNoun,
    /// Proper noun, plural (`NNPS`).
    ProperNounPlural,
    /// Verb, base form (`VB`).
    Verb,
    /// Verb, past tense (`VBD`).
    VerbPast,
    /// Verb, gerund / `-ing` form (`VBG`).
    VerbGerund,
    /// Verb, third person singular (`VBZ`).
    Verb3rdSing,
    /// Adjective (`JJ`).
    Adjective,
    /// Adverb (`RB`).
    Adverb,
    /// Preposition (`IN`).
    Preposition,
    /// Determiner (`DT`).
    Determiner,
    /// Pronoun (`PRP`).
    Pronoun,
    /// Conjunction (`CC`).
    Conjunction,
    /// Any punctuation token (`.`).
    Punctuation,
    /// Number (`CD`).
    Number,
    /// No category could be determined (`UNK`).
    Unknown,
}

impl POSTag {
    /// Returns the Penn-Treebank-style tag string for this category.
    ///
    /// The result is a string literal, so it is `'static` and may outlive the
    /// `POSTag` value it was obtained from.
    ///
    /// Two strings are simplifications: all punctuation is reported as `"."`,
    /// and [`POSTag::Unknown`] is reported as the placeholder `"UNK"`.
    pub fn penn_tag(&self) -> &'static str {
        match self {
            POSTag::Noun => "NN",
            POSTag::NounPlural => "NNS",
            POSTag::ProperNoun => "NNP",
            POSTag::ProperNounPlural => "NNPS",
            POSTag::Verb => "VB",
            POSTag::VerbPast => "VBD",
            POSTag::VerbGerund => "VBG",
            POSTag::Verb3rdSing => "VBZ",
            POSTag::Adjective => "JJ",
            POSTag::Adverb => "RB",
            POSTag::Preposition => "IN",
            POSTag::Determiner => "DT",
            POSTag::Pronoun => "PRP",
            POSTag::Conjunction => "CC",
            POSTag::Punctuation => ".",
            POSTag::Number => "CD",
            POSTag::Unknown => "UNK",
        }
    }
}
80
/// Label on a [`Dependency`] edge, describing how the dependent relates to its head.
///
/// The parser in this file currently emits `Subject`, `DirectObject`,
/// `Modifier` and `Determiner`; the remaining variants are available to
/// callers but are not produced by the code visible here.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum DependencyRelation {
    /// The dependent is the subject of the head.
    Subject,
    /// The dependent is the direct object of the head.
    DirectObject,
    /// The dependent is the indirect object of the head.
    IndirectObject,
    /// The dependent modifies the head (used for adjectives and adverbs).
    Modifier,
    /// The dependent is a determiner of the head.
    Determiner,
    /// The dependent is a prepositional modifier of the head.
    PrepositionalModifier,
    /// The dependent is joined to the head by a conjunction.
    Conjunction,
    /// The dependent is a complement of the head.
    Complement,
    /// Marks the root of the sentence.
    Root,
    /// The relation could not be determined.
    Unknown,
}
105
106#[derive(Debug, Clone)]
108pub struct Token {
109 pub text: String,
111 pub position: usize,
113 pub pos: POSTag,
115 pub lemma: String,
117}
118
119#[derive(Debug, Clone)]
121pub struct Dependency {
122 pub head: usize,
124 pub dependent: usize,
126 pub relation: DependencyRelation,
128}
129
130#[derive(Debug, Clone)]
132pub struct NounPhrase {
133 pub tokens: Vec<Token>,
135 pub head_idx: usize,
137 pub text: String,
139}
140
/// Feature switches for a `SyntaxAnalyzer`.
///
/// NOTE(review): the analyzer methods visible in this file do not consult
/// these flags; confirm whether another layer is expected to honour them.
#[derive(Clone, Debug)]
pub struct SyntaxAnalyzerConfig {
    /// Whether part-of-speech tagging is enabled.
    pub enable_pos_tagging: bool,
    /// Whether dependency parsing is enabled.
    pub enable_dependency_parsing: bool,
    /// Whether phrase extraction is enabled.
    pub enable_phrase_extraction: bool,
}

impl Default for SyntaxAnalyzerConfig {
    /// Returns a configuration with every feature switched on.
    fn default() -> Self {
        let enabled = true;
        Self {
            enable_pos_tagging: enabled,
            enable_dependency_parsing: enabled,
            enable_phrase_extraction: enabled,
        }
    }
}
161
162pub struct SyntaxAnalyzer {
164 #[allow(dead_code)] config: SyntaxAnalyzerConfig,
166 common_nouns: HashMap<String, POSTag>,
168 common_verbs: HashMap<String, POSTag>,
169 common_adjectives: HashMap<String, POSTag>,
170 common_adverbs: HashMap<String, POSTag>,
171 prepositions: HashMap<String, POSTag>,
172 determiners: HashMap<String, POSTag>,
173 pronouns: HashMap<String, POSTag>,
174 conjunctions: HashMap<String, POSTag>,
175}
176
177impl SyntaxAnalyzer {
178 pub fn new(config: SyntaxAnalyzerConfig) -> Self {
180 Self {
181 config,
182 common_nouns: Self::build_noun_dict(),
183 common_verbs: Self::build_verb_dict(),
184 common_adjectives: Self::build_adjective_dict(),
185 common_adverbs: Self::build_adverb_dict(),
186 prepositions: Self::build_preposition_dict(),
187 determiners: Self::build_determiner_dict(),
188 pronouns: Self::build_pronoun_dict(),
189 conjunctions: Self::build_conjunction_dict(),
190 }
191 }
192
193 fn tokenize(&self, text: &str) -> Vec<(String, usize)> {
195 let mut tokens = Vec::new();
196 let mut current_word = String::new();
197 let mut word_start = 0;
198
199 for (i, ch) in text.chars().enumerate() {
200 if ch.is_alphanumeric() || ch == '\'' || ch == '-' {
201 if current_word.is_empty() {
202 word_start = i;
203 }
204 current_word.push(ch);
205 } else {
206 if !current_word.is_empty() {
207 tokens.push((current_word.clone(), word_start));
208 current_word.clear();
209 }
210 if !ch.is_whitespace() {
212 tokens.push((ch.to_string(), i));
213 }
214 }
215 }
216
217 if !current_word.is_empty() {
218 tokens.push((current_word, word_start));
219 }
220
221 tokens
222 }
223
224 pub fn pos_tag(&self, text: &str) -> Result<Vec<Token>> {
226 let raw_tokens = self.tokenize(text);
227 let mut tokens = Vec::new();
228
229 for (word, position) in raw_tokens {
230 let pos = self.tag_word(&word);
231 let lemma = self.lemmatize(&word, &pos);
232
233 tokens.push(Token {
234 text: word,
235 position,
236 pos,
237 lemma,
238 });
239 }
240
241 Ok(tokens)
242 }
243
244 fn tag_word(&self, word: &str) -> POSTag {
246 let lower = word.to_lowercase();
247
248 if word.chars().all(|c| c.is_ascii_punctuation()) {
250 return POSTag::Punctuation;
251 }
252
253 if word.chars().all(|c| c.is_ascii_digit()) {
255 return POSTag::Number;
256 }
257
258 if let Some(pos) = self.determiners.get(&lower) {
260 return pos.clone();
261 }
262 if let Some(pos) = self.pronouns.get(&lower) {
263 return pos.clone();
264 }
265 if let Some(pos) = self.prepositions.get(&lower) {
266 return pos.clone();
267 }
268 if let Some(pos) = self.conjunctions.get(&lower) {
269 return pos.clone();
270 }
271 if let Some(pos) = self.common_adverbs.get(&lower) {
272 return pos.clone();
273 }
274 if let Some(pos) = self.common_verbs.get(&lower) {
275 return pos.clone();
276 }
277 if let Some(pos) = self.common_adjectives.get(&lower) {
278 return pos.clone();
279 }
280 if let Some(pos) = self.common_nouns.get(&lower) {
281 return pos.clone();
282 }
283
284 if word
287 .chars()
288 .next()
289 .expect("non-empty string")
290 .is_uppercase()
291 {
292 return POSTag::ProperNoun;
293 }
294
295 if lower.ends_with("ing") {
297 return POSTag::VerbGerund;
298 }
299 if lower.ends_with("ed") {
300 return POSTag::VerbPast;
301 }
302
303 if lower.ends_with('s') && !lower.ends_with("ss") {
305 return POSTag::NounPlural;
306 }
307
308 if lower.ends_with("ive") || lower.ends_with("ous") || lower.ends_with("ful") {
310 return POSTag::Adjective;
311 }
312
313 if lower.ends_with("ly") {
315 return POSTag::Adverb;
316 }
317
318 POSTag::Noun
320 }
321
322 fn lemmatize(&self, word: &str, pos: &POSTag) -> String {
324 let lower = word.to_lowercase();
325
326 match pos {
327 POSTag::NounPlural => {
328 if lower.ends_with("ies") {
330 return format!("{}y", &lower[..lower.len() - 3]);
331 }
332 if lower.ends_with('s') && !lower.ends_with("ss") {
333 return lower[..lower.len() - 1].to_string();
334 }
335 lower
336 },
337 POSTag::VerbPast | POSTag::Verb3rdSing => {
338 if lower.ends_with("ed") {
340 return lower[..lower.len() - 2].to_string();
341 }
342 if lower.ends_with('s') {
343 return lower[..lower.len() - 1].to_string();
344 }
345 lower
346 },
347 POSTag::VerbGerund => {
348 if lower.ends_with("ing") {
350 return lower[..lower.len() - 3].to_string();
351 }
352 lower
353 },
354 _ => lower,
355 }
356 }
357
358 pub fn parse_dependencies(&self, tokens: &[Token]) -> Result<Vec<Dependency>> {
360 let mut dependencies = Vec::new();
361
362 if tokens.is_empty() {
363 return Ok(dependencies);
364 }
365
366 let root_idx = tokens
368 .iter()
369 .position(|t| matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing))
370 .unwrap_or(0);
371
372 #[allow(clippy::needless_range_loop)]
374 for i in 0..root_idx {
375 if matches!(
376 tokens[i].pos,
377 POSTag::Noun | POSTag::ProperNoun | POSTag::Pronoun
378 ) {
379 dependencies.push(Dependency {
380 head: root_idx,
381 dependent: i,
382 relation: DependencyRelation::Subject,
383 });
384 break;
385 }
386 }
387
388 #[allow(clippy::needless_range_loop)]
390 for i in (root_idx + 1)..tokens.len() {
391 if matches!(tokens[i].pos, POSTag::Noun | POSTag::ProperNoun) {
392 dependencies.push(Dependency {
393 head: root_idx,
394 dependent: i,
395 relation: DependencyRelation::DirectObject,
396 });
397 break;
398 }
399 }
400
401 for i in 0..tokens.len() {
403 match tokens[i].pos {
404 POSTag::Adjective => {
405 if let Some(noun_idx) = tokens[i + 1..]
407 .iter()
408 .position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
409 {
410 dependencies.push(Dependency {
411 head: i + 1 + noun_idx,
412 dependent: i,
413 relation: DependencyRelation::Modifier,
414 });
415 }
416 },
417 POSTag::Adverb => {
418 let verb_idx = tokens.iter().position(|t| {
420 matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing)
421 });
422 if let Some(v_idx) = verb_idx {
423 dependencies.push(Dependency {
424 head: v_idx,
425 dependent: i,
426 relation: DependencyRelation::Modifier,
427 });
428 }
429 },
430 POSTag::Determiner => {
431 if let Some(noun_idx) = tokens[i + 1..]
433 .iter()
434 .position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
435 {
436 dependencies.push(Dependency {
437 head: i + 1 + noun_idx,
438 dependent: i,
439 relation: DependencyRelation::Determiner,
440 });
441 }
442 },
443 _ => {},
444 }
445 }
446
447 Ok(dependencies)
448 }
449
450 pub fn extract_noun_phrases(&self, tokens: &[Token]) -> Result<Vec<NounPhrase>> {
452 let mut phrases = Vec::new();
453 let mut current_phrase: Vec<Token> = Vec::new();
454 let mut head_idx = 0;
455
456 for token in tokens {
457 match token.pos {
458 POSTag::Determiner | POSTag::Adjective => {
459 current_phrase.push(token.clone());
461 },
462 POSTag::Noun
463 | POSTag::ProperNoun
464 | POSTag::NounPlural
465 | POSTag::ProperNounPlural => {
466 head_idx = current_phrase.len();
468 current_phrase.push(token.clone());
469 },
470 _ => {
471 if !current_phrase.is_empty() {
473 let text = current_phrase
474 .iter()
475 .map(|t| t.text.as_str())
476 .collect::<Vec<_>>()
477 .join(" ");
478
479 phrases.push(NounPhrase {
480 tokens: current_phrase.clone(),
481 head_idx,
482 text,
483 });
484
485 current_phrase.clear();
486 head_idx = 0;
487 }
488 },
489 }
490 }
491
492 if !current_phrase.is_empty() {
494 let text = current_phrase
495 .iter()
496 .map(|t| t.text.as_str())
497 .collect::<Vec<_>>()
498 .join(" ");
499
500 phrases.push(NounPhrase {
501 tokens: current_phrase,
502 head_idx,
503 text,
504 });
505 }
506
507 Ok(phrases)
508 }
509
510 pub fn segment_sentences(&self, text: &str) -> Vec<String> {
512 let sentence_regex = Regex::new(r"[.!?]+\s+").expect("static regex literal");
513 sentence_regex
514 .split(text)
515 .map(|s| s.trim().to_string())
516 .filter(|s| !s.is_empty())
517 .collect()
518 }
519
520 fn build_noun_dict() -> HashMap<String, POSTag> {
522 let nouns = vec![
523 "time",
524 "person",
525 "year",
526 "way",
527 "day",
528 "thing",
529 "man",
530 "world",
531 "life",
532 "hand",
533 "part",
534 "child",
535 "eye",
536 "woman",
537 "place",
538 "work",
539 "week",
540 "case",
541 "point",
542 "government",
543 "company",
544 "number",
545 "group",
546 "problem",
547 "fact",
548 ];
549 nouns
550 .into_iter()
551 .map(|s| (s.to_string(), POSTag::Noun))
552 .collect()
553 }
554
555 fn build_verb_dict() -> HashMap<String, POSTag> {
556 let verbs = vec![
557 "be", "have", "do", "say", "get", "make", "go", "know", "take", "see", "come", "think",
558 "look", "want", "give", "use", "find", "tell", "ask", "work", "seem", "feel", "try",
559 "leave", "call",
560 ];
561 verbs
562 .into_iter()
563 .map(|s| (s.to_string(), POSTag::Verb))
564 .collect()
565 }
566
567 fn build_adjective_dict() -> HashMap<String, POSTag> {
568 let adjectives = vec![
569 "good",
570 "new",
571 "first",
572 "last",
573 "long",
574 "great",
575 "little",
576 "own",
577 "other",
578 "old",
579 "right",
580 "big",
581 "high",
582 "different",
583 "small",
584 "large",
585 "next",
586 "early",
587 "young",
588 "important",
589 "few",
590 "public",
591 "bad",
592 "same",
593 "able",
594 ];
595 adjectives
596 .into_iter()
597 .map(|s| (s.to_string(), POSTag::Adjective))
598 .collect()
599 }
600
601 fn build_adverb_dict() -> HashMap<String, POSTag> {
602 let adverbs = vec![
603 "not", "so", "out", "up", "now", "only", "just", "more", "also", "very", "well",
604 "back", "there", "even", "still", "too", "here", "then", "always", "never", "often",
605 "quite", "really", "almost", "again",
606 ];
607 adverbs
608 .into_iter()
609 .map(|s| (s.to_string(), POSTag::Adverb))
610 .collect()
611 }
612
613 fn build_preposition_dict() -> HashMap<String, POSTag> {
614 let prepositions = vec![
615 "of", "in", "to", "for", "with", "on", "at", "from", "by", "about", "into", "through",
616 "during", "before", "after", "above", "below", "between", "under", "since", "without",
617 "within", "along", "among", "across",
618 ];
619 prepositions
620 .into_iter()
621 .map(|s| (s.to_string(), POSTag::Preposition))
622 .collect()
623 }
624
625 fn build_determiner_dict() -> HashMap<String, POSTag> {
626 let determiners = vec![
627 "the", "a", "an", "this", "that", "these", "those", "my", "your", "his", "her", "its",
628 "our", "their", "all", "both", "each", "every", "some", "any", "no", "another", "such",
629 "what", "which",
630 ];
631 determiners
632 .into_iter()
633 .map(|s| (s.to_string(), POSTag::Determiner))
634 .collect()
635 }
636
637 fn build_pronoun_dict() -> HashMap<String, POSTag> {
638 let pronouns = vec![
639 "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", "who",
640 "whom", "what", "which", "this", "that",
641 ];
642 pronouns
643 .into_iter()
644 .map(|s| (s.to_string(), POSTag::Pronoun))
645 .collect()
646 }
647
648 fn build_conjunction_dict() -> HashMap<String, POSTag> {
649 let conjunctions = vec![
650 "and", "or", "but", "nor", "yet", "so", "for", "because", "although", "though",
651 "while", "if", "unless", "until", "when", "where",
652 ];
653 conjunctions
654 .into_iter()
655 .map(|s| (s.to_string(), POSTag::Conjunction))
656 .collect()
657 }
658}
659
#[cfg(test)]
mod tests {
    use super::*;

    /// Analyzer with the default configuration, shared by every test below.
    fn default_analyzer() -> SyntaxAnalyzer {
        SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default())
    }

    #[test]
    fn test_pos_tagging() {
        let tagged = default_analyzer()
            .pos_tag("The good brown fox jumps over the lazy dog.")
            .unwrap();

        assert!(!tagged.is_empty());

        // "The" and "good" come from the lexicons; "fox" falls back to a noun tag.
        assert_eq!(tagged[0].pos, POSTag::Determiner);
        assert_eq!(tagged[1].pos, POSTag::Adjective);
        assert!(matches!(tagged[3].pos, POSTag::Noun | POSTag::ProperNoun));
        assert!(tagged.iter().any(|t| t.text == "jumps"));
    }

    #[test]
    fn test_lemmatization() {
        let analyzer = default_analyzer();

        // Plain suffix stripping: the doubled consonant in "running" is kept.
        let cases = [
            ("running", POSTag::VerbGerund, "runn"),
            ("cats", POSTag::NounPlural, "cat"),
            ("jumped", POSTag::VerbPast, "jump"),
        ];
        for (word, tag, expected) in cases.iter() {
            assert_eq!(analyzer.lemmatize(word, tag), *expected);
        }
    }

    #[test]
    fn test_noun_phrase_extraction() {
        let analyzer = default_analyzer();

        let tagged = analyzer.pos_tag("The quick brown fox").unwrap();
        let found = analyzer.extract_noun_phrases(&tagged).unwrap();

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "The quick brown fox");
    }

    #[test]
    fn test_dependency_parsing() {
        let analyzer = default_analyzer();

        let tagged = analyzer.pos_tag("The cat chased the mouse").unwrap();
        let edges = analyzer.parse_dependencies(&tagged).unwrap();

        assert!(!edges.is_empty());

        let has_subject = edges
            .iter()
            .any(|edge| matches!(edge.relation, DependencyRelation::Subject));
        assert!(has_subject, "Should have subject dependency");
    }

    #[test]
    fn test_sentence_segmentation() {
        let sentences = default_analyzer()
            .segment_sentences("This is sentence one. This is sentence two! And sentence three?");

        let fragments = ["sentence one", "sentence two", "sentence three"];
        assert_eq!(sentences.len(), 3);
        for (sentence, fragment) in sentences.iter().zip(fragments.iter()) {
            assert!(sentence.contains(fragment));
        }
    }

    #[test]
    fn test_tokenization() {
        let raw = default_analyzer().tokenize("Hello, world!");

        // "Hello", ",", "world", "!"
        assert_eq!(raw.len(), 4);
        assert_eq!(raw[0].0, "Hello");
        assert_eq!(raw[1].0, ",");
    }

    #[test]
    fn test_proper_noun_detection() {
        let tagged = default_analyzer()
            .pos_tag("John Smith lives in New York")
            .unwrap();

        let found_proper_noun = tagged
            .iter()
            .any(|t| matches!(t.pos, POSTag::ProperNoun));
        assert!(found_proper_noun);
    }
}