1use crate::Result;
14use regex::Regex;
15use std::collections::HashMap;
16
/// Part-of-speech categories, mirroring a subset of the Penn Treebank
/// tag set (see `penn_tag` for the string form of each variant).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum POSTag {
    /// Singular common noun ("NN").
    Noun,
    /// Plural common noun ("NNS").
    NounPlural,
    /// Singular proper noun ("NNP").
    ProperNoun,
    /// Plural proper noun ("NNPS").
    ProperNounPlural,
    /// Verb, base form ("VB").
    Verb,
    /// Verb, past tense ("VBD").
    VerbPast,
    /// Verb, gerund / present participle ("VBG").
    VerbGerund,
    /// Verb, third-person singular present ("VBZ").
    Verb3rdSing,
    /// Adjective ("JJ").
    Adjective,
    /// Adverb ("RB").
    Adverb,
    /// Preposition ("IN").
    Preposition,
    /// Determiner ("DT").
    Determiner,
    /// Pronoun ("PRP").
    Pronoun,
    /// Conjunction ("CC").
    Conjunction,
    /// Punctuation mark (generic ".").
    Punctuation,
    /// Cardinal number ("CD").
    Number,
    /// Unrecognized token (non-standard "UNK").
    Unknown,
}
55
56impl POSTag {
57 pub fn penn_tag(&self) -> &str {
59 match self {
60 POSTag::Noun => "NN",
61 POSTag::NounPlural => "NNS",
62 POSTag::ProperNoun => "NNP",
63 POSTag::ProperNounPlural => "NNPS",
64 POSTag::Verb => "VB",
65 POSTag::VerbPast => "VBD",
66 POSTag::VerbGerund => "VBG",
67 POSTag::Verb3rdSing => "VBZ",
68 POSTag::Adjective => "JJ",
69 POSTag::Adverb => "RB",
70 POSTag::Preposition => "IN",
71 POSTag::Determiner => "DT",
72 POSTag::Pronoun => "PRP",
73 POSTag::Conjunction => "CC",
74 POSTag::Punctuation => ".",
75 POSTag::Number => "CD",
76 POSTag::Unknown => "UNK",
77 }
78 }
79}
80
/// Grammatical relations used by the shallow dependency parser.
///
/// NOTE(review): `parse_dependencies` as written only emits `Subject`,
/// `DirectObject`, `Modifier`, and `Determiner`; the remaining variants are
/// declared for future use.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DependencyRelation {
    /// Nominal subject of the root verb.
    Subject,
    /// Direct object of the root verb.
    DirectObject,
    /// Indirect object (declared; not currently emitted).
    IndirectObject,
    /// Adjective or adverb modifier.
    Modifier,
    /// Determiner attachment to a noun.
    Determiner,
    /// Prepositional modifier (declared; not currently emitted).
    PrepositionalModifier,
    /// Conjunction link (declared; not currently emitted).
    Conjunction,
    /// Clausal complement (declared; not currently emitted).
    Complement,
    /// Sentence root (declared; not currently emitted).
    Root,
    /// Unclassified relation.
    Unknown,
}
105
/// A single token produced by `SyntaxAnalyzer::pos_tag`.
#[derive(Debug, Clone)]
pub struct Token {
    /// Surface text exactly as it appeared in the input.
    pub text: String,
    /// Index of the token's first character in the input, counted in
    /// `char`s (not bytes) — `tokenize` enumerates `chars()`.
    pub position: usize,
    /// Part-of-speech tag assigned by the heuristic tagger.
    pub pos: POSTag,
    /// Lowercased, suffix-stripped base form of the token.
    pub lemma: String,
}
118
/// One head -> dependent arc produced by `parse_dependencies`.
#[derive(Debug, Clone)]
pub struct Dependency {
    /// Index of the head token within the analyzed token slice.
    pub head: usize,
    /// Index of the dependent token within the analyzed token slice.
    pub dependent: usize,
    /// Grammatical relation holding between head and dependent.
    pub relation: DependencyRelation,
}
129
/// A contiguous noun phrase found by `extract_noun_phrases`.
#[derive(Debug, Clone)]
pub struct NounPhrase {
    /// The tokens making up the phrase, in input order.
    pub tokens: Vec<Token>,
    /// Index *within* `tokens` of the phrase head: the last noun-like token
    /// added (0 if the phrase was flushed before any noun was seen).
    pub head_idx: usize,
    /// The phrase's surface text: token texts joined with single spaces.
    pub text: String,
}
140
/// Switches for the individual analysis stages.
///
/// NOTE(review): these flags are stored but not consulted by any method in
/// this file (the `config` field carries `#[allow(dead_code)]`); callers
/// should not yet rely on them disabling anything.
#[derive(Debug, Clone)]
pub struct SyntaxAnalyzerConfig {
    /// Enable part-of-speech tagging.
    pub enable_pos_tagging: bool,
    /// Enable heuristic dependency parsing.
    pub enable_dependency_parsing: bool,
    /// Enable noun-phrase extraction.
    pub enable_phrase_extraction: bool,
}
151
impl Default for SyntaxAnalyzerConfig {
    /// Default configuration: every analysis stage enabled.
    fn default() -> Self {
        Self {
            enable_pos_tagging: true,
            enable_dependency_parsing: true,
            enable_phrase_extraction: true,
        }
    }
}
161
/// Dictionary- and heuristic-based syntactic analyzer: POS tagging, shallow
/// dependency parsing, noun-phrase extraction, and sentence segmentation.
/// No external models; the word lists below are built in at construction.
pub struct SyntaxAnalyzer {
    /// Stored configuration; not consulted by the current implementation.
    #[allow(dead_code)] config: SyntaxAnalyzerConfig,
    /// Common singular nouns -> `POSTag::Noun`.
    common_nouns: HashMap<String, POSTag>,
    /// Common base-form verbs -> `POSTag::Verb`.
    common_verbs: HashMap<String, POSTag>,
    /// Common adjectives -> `POSTag::Adjective`.
    common_adjectives: HashMap<String, POSTag>,
    /// Common adverbs -> `POSTag::Adverb`.
    common_adverbs: HashMap<String, POSTag>,
    /// Closed-class prepositions -> `POSTag::Preposition`.
    prepositions: HashMap<String, POSTag>,
    /// Closed-class determiners -> `POSTag::Determiner`.
    determiners: HashMap<String, POSTag>,
    /// Closed-class pronouns -> `POSTag::Pronoun`.
    pronouns: HashMap<String, POSTag>,
    /// Closed-class conjunctions -> `POSTag::Conjunction`.
    conjunctions: HashMap<String, POSTag>,
}
176
177impl SyntaxAnalyzer {
    /// Creates an analyzer with the given configuration and pre-built word
    /// lists for the closed-class and common open-class vocabularies.
    pub fn new(config: SyntaxAnalyzerConfig) -> Self {
        Self {
            config,
            common_nouns: Self::build_noun_dict(),
            common_verbs: Self::build_verb_dict(),
            common_adjectives: Self::build_adjective_dict(),
            common_adverbs: Self::build_adverb_dict(),
            prepositions: Self::build_preposition_dict(),
            determiners: Self::build_determiner_dict(),
            pronouns: Self::build_pronoun_dict(),
            conjunctions: Self::build_conjunction_dict(),
        }
    }
192
193 fn tokenize(&self, text: &str) -> Vec<(String, usize)> {
195 let mut tokens = Vec::new();
196 let mut current_word = String::new();
197 let mut word_start = 0;
198
199 for (i, ch) in text.chars().enumerate() {
200 if ch.is_alphanumeric() || ch == '\'' || ch == '-' {
201 if current_word.is_empty() {
202 word_start = i;
203 }
204 current_word.push(ch);
205 } else {
206 if !current_word.is_empty() {
207 tokens.push((current_word.clone(), word_start));
208 current_word.clear();
209 }
210 if !ch.is_whitespace() {
212 tokens.push((ch.to_string(), i));
213 }
214 }
215 }
216
217 if !current_word.is_empty() {
218 tokens.push((current_word, word_start));
219 }
220
221 tokens
222 }
223
224 pub fn pos_tag(&self, text: &str) -> Result<Vec<Token>> {
226 let raw_tokens = self.tokenize(text);
227 let mut tokens = Vec::new();
228
229 for (word, position) in raw_tokens {
230 let pos = self.tag_word(&word);
231 let lemma = self.lemmatize(&word, &pos);
232
233 tokens.push(Token {
234 text: word,
235 position,
236 pos,
237 lemma,
238 });
239 }
240
241 Ok(tokens)
242 }
243
244 fn tag_word(&self, word: &str) -> POSTag {
246 let lower = word.to_lowercase();
247
248 if word.chars().all(|c| c.is_ascii_punctuation()) {
250 return POSTag::Punctuation;
251 }
252
253 if word.chars().all(|c| c.is_ascii_digit()) {
255 return POSTag::Number;
256 }
257
258 if let Some(pos) = self.determiners.get(&lower) {
260 return pos.clone();
261 }
262 if let Some(pos) = self.pronouns.get(&lower) {
263 return pos.clone();
264 }
265 if let Some(pos) = self.prepositions.get(&lower) {
266 return pos.clone();
267 }
268 if let Some(pos) = self.conjunctions.get(&lower) {
269 return pos.clone();
270 }
271 if let Some(pos) = self.common_adverbs.get(&lower) {
272 return pos.clone();
273 }
274 if let Some(pos) = self.common_verbs.get(&lower) {
275 return pos.clone();
276 }
277 if let Some(pos) = self.common_adjectives.get(&lower) {
278 return pos.clone();
279 }
280 if let Some(pos) = self.common_nouns.get(&lower) {
281 return pos.clone();
282 }
283
284 if word.chars().next().unwrap().is_uppercase() {
287 return POSTag::ProperNoun;
288 }
289
290 if lower.ends_with("ing") {
292 return POSTag::VerbGerund;
293 }
294 if lower.ends_with("ed") {
295 return POSTag::VerbPast;
296 }
297
298 if lower.ends_with('s') && !lower.ends_with("ss") {
300 return POSTag::NounPlural;
301 }
302
303 if lower.ends_with("ive") || lower.ends_with("ous") || lower.ends_with("ful") {
305 return POSTag::Adjective;
306 }
307
308 if lower.ends_with("ly") {
310 return POSTag::Adverb;
311 }
312
313 POSTag::Noun
315 }
316
317 fn lemmatize(&self, word: &str, pos: &POSTag) -> String {
319 let lower = word.to_lowercase();
320
321 match pos {
322 POSTag::NounPlural => {
323 if lower.ends_with("ies") {
325 return format!("{}y", &lower[..lower.len() - 3]);
326 }
327 if lower.ends_with('s') && !lower.ends_with("ss") {
328 return lower[..lower.len() - 1].to_string();
329 }
330 lower
331 },
332 POSTag::VerbPast | POSTag::Verb3rdSing => {
333 if lower.ends_with("ed") {
335 return lower[..lower.len() - 2].to_string();
336 }
337 if lower.ends_with('s') {
338 return lower[..lower.len() - 1].to_string();
339 }
340 lower
341 },
342 POSTag::VerbGerund => {
343 if lower.ends_with("ing") {
345 return lower[..lower.len() - 3].to_string();
346 }
347 lower
348 },
349 _ => lower,
350 }
351 }
352
353 pub fn parse_dependencies(&self, tokens: &[Token]) -> Result<Vec<Dependency>> {
355 let mut dependencies = Vec::new();
356
357 if tokens.is_empty() {
358 return Ok(dependencies);
359 }
360
361 let root_idx = tokens
363 .iter()
364 .position(|t| matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing))
365 .unwrap_or(0);
366
367 #[allow(clippy::needless_range_loop)]
369 for i in 0..root_idx {
370 if matches!(
371 tokens[i].pos,
372 POSTag::Noun | POSTag::ProperNoun | POSTag::Pronoun
373 ) {
374 dependencies.push(Dependency {
375 head: root_idx,
376 dependent: i,
377 relation: DependencyRelation::Subject,
378 });
379 break;
380 }
381 }
382
383 #[allow(clippy::needless_range_loop)]
385 for i in (root_idx + 1)..tokens.len() {
386 if matches!(tokens[i].pos, POSTag::Noun | POSTag::ProperNoun) {
387 dependencies.push(Dependency {
388 head: root_idx,
389 dependent: i,
390 relation: DependencyRelation::DirectObject,
391 });
392 break;
393 }
394 }
395
396 for i in 0..tokens.len() {
398 match tokens[i].pos {
399 POSTag::Adjective => {
400 if let Some(noun_idx) = tokens[i + 1..]
402 .iter()
403 .position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
404 {
405 dependencies.push(Dependency {
406 head: i + 1 + noun_idx,
407 dependent: i,
408 relation: DependencyRelation::Modifier,
409 });
410 }
411 },
412 POSTag::Adverb => {
413 let verb_idx = tokens.iter().position(|t| {
415 matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing)
416 });
417 if let Some(v_idx) = verb_idx {
418 dependencies.push(Dependency {
419 head: v_idx,
420 dependent: i,
421 relation: DependencyRelation::Modifier,
422 });
423 }
424 },
425 POSTag::Determiner => {
426 if let Some(noun_idx) = tokens[i + 1..]
428 .iter()
429 .position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
430 {
431 dependencies.push(Dependency {
432 head: i + 1 + noun_idx,
433 dependent: i,
434 relation: DependencyRelation::Determiner,
435 });
436 }
437 },
438 _ => {},
439 }
440 }
441
442 Ok(dependencies)
443 }
444
445 pub fn extract_noun_phrases(&self, tokens: &[Token]) -> Result<Vec<NounPhrase>> {
447 let mut phrases = Vec::new();
448 let mut current_phrase: Vec<Token> = Vec::new();
449 let mut head_idx = 0;
450
451 for token in tokens {
452 match token.pos {
453 POSTag::Determiner | POSTag::Adjective => {
454 current_phrase.push(token.clone());
456 },
457 POSTag::Noun
458 | POSTag::ProperNoun
459 | POSTag::NounPlural
460 | POSTag::ProperNounPlural => {
461 head_idx = current_phrase.len();
463 current_phrase.push(token.clone());
464 },
465 _ => {
466 if !current_phrase.is_empty() {
468 let text = current_phrase
469 .iter()
470 .map(|t| t.text.as_str())
471 .collect::<Vec<_>>()
472 .join(" ");
473
474 phrases.push(NounPhrase {
475 tokens: current_phrase.clone(),
476 head_idx,
477 text,
478 });
479
480 current_phrase.clear();
481 head_idx = 0;
482 }
483 },
484 }
485 }
486
487 if !current_phrase.is_empty() {
489 let text = current_phrase
490 .iter()
491 .map(|t| t.text.as_str())
492 .collect::<Vec<_>>()
493 .join(" ");
494
495 phrases.push(NounPhrase {
496 tokens: current_phrase,
497 head_idx,
498 text,
499 });
500 }
501
502 Ok(phrases)
503 }
504
505 pub fn segment_sentences(&self, text: &str) -> Vec<String> {
507 let sentence_regex = Regex::new(r"[.!?]+\s+").unwrap();
508 sentence_regex
509 .split(text)
510 .map(|s| s.trim().to_string())
511 .filter(|s| !s.is_empty())
512 .collect()
513 }
514
515 fn build_noun_dict() -> HashMap<String, POSTag> {
517 let nouns = vec![
518 "time",
519 "person",
520 "year",
521 "way",
522 "day",
523 "thing",
524 "man",
525 "world",
526 "life",
527 "hand",
528 "part",
529 "child",
530 "eye",
531 "woman",
532 "place",
533 "work",
534 "week",
535 "case",
536 "point",
537 "government",
538 "company",
539 "number",
540 "group",
541 "problem",
542 "fact",
543 ];
544 nouns
545 .into_iter()
546 .map(|s| (s.to_string(), POSTag::Noun))
547 .collect()
548 }
549
550 fn build_verb_dict() -> HashMap<String, POSTag> {
551 let verbs = vec![
552 "be", "have", "do", "say", "get", "make", "go", "know", "take", "see", "come", "think",
553 "look", "want", "give", "use", "find", "tell", "ask", "work", "seem", "feel", "try",
554 "leave", "call",
555 ];
556 verbs
557 .into_iter()
558 .map(|s| (s.to_string(), POSTag::Verb))
559 .collect()
560 }
561
562 fn build_adjective_dict() -> HashMap<String, POSTag> {
563 let adjectives = vec![
564 "good",
565 "new",
566 "first",
567 "last",
568 "long",
569 "great",
570 "little",
571 "own",
572 "other",
573 "old",
574 "right",
575 "big",
576 "high",
577 "different",
578 "small",
579 "large",
580 "next",
581 "early",
582 "young",
583 "important",
584 "few",
585 "public",
586 "bad",
587 "same",
588 "able",
589 ];
590 adjectives
591 .into_iter()
592 .map(|s| (s.to_string(), POSTag::Adjective))
593 .collect()
594 }
595
596 fn build_adverb_dict() -> HashMap<String, POSTag> {
597 let adverbs = vec![
598 "not", "so", "out", "up", "now", "only", "just", "more", "also", "very", "well",
599 "back", "there", "even", "still", "too", "here", "then", "always", "never", "often",
600 "quite", "really", "almost", "again",
601 ];
602 adverbs
603 .into_iter()
604 .map(|s| (s.to_string(), POSTag::Adverb))
605 .collect()
606 }
607
608 fn build_preposition_dict() -> HashMap<String, POSTag> {
609 let prepositions = vec![
610 "of", "in", "to", "for", "with", "on", "at", "from", "by", "about", "into", "through",
611 "during", "before", "after", "above", "below", "between", "under", "since", "without",
612 "within", "along", "among", "across",
613 ];
614 prepositions
615 .into_iter()
616 .map(|s| (s.to_string(), POSTag::Preposition))
617 .collect()
618 }
619
620 fn build_determiner_dict() -> HashMap<String, POSTag> {
621 let determiners = vec![
622 "the", "a", "an", "this", "that", "these", "those", "my", "your", "his", "her", "its",
623 "our", "their", "all", "both", "each", "every", "some", "any", "no", "another", "such",
624 "what", "which",
625 ];
626 determiners
627 .into_iter()
628 .map(|s| (s.to_string(), POSTag::Determiner))
629 .collect()
630 }
631
632 fn build_pronoun_dict() -> HashMap<String, POSTag> {
633 let pronouns = vec![
634 "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", "who",
635 "whom", "what", "which", "this", "that",
636 ];
637 pronouns
638 .into_iter()
639 .map(|s| (s.to_string(), POSTag::Pronoun))
640 .collect()
641 }
642
643 fn build_conjunction_dict() -> HashMap<String, POSTag> {
644 let conjunctions = vec![
645 "and", "or", "but", "nor", "yet", "so", "for", "because", "although", "though",
646 "while", "if", "unless", "until", "when", "where",
647 ];
648 conjunctions
649 .into_iter()
650 .map(|s| (s.to_string(), POSTag::Conjunction))
651 .collect()
652 }
653}
654
655#[cfg(test)]
656mod tests {
657 use super::*;
658
659 #[test]
660 fn test_pos_tagging() {
661 let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
662 let text = "The good brown fox jumps over the lazy dog.";
663
664 let tokens = analyzer.pos_tag(text).unwrap();
665
666 assert!(!tokens.is_empty());
667
668 assert_eq!(tokens[0].pos, POSTag::Determiner); assert_eq!(tokens[1].pos, POSTag::Adjective); assert!(matches!(tokens[3].pos, POSTag::Noun | POSTag::ProperNoun)); assert!(tokens.iter().any(|t| t.text == "jumps"));
674 }
675
    #[test]
    fn test_lemmatization() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        // Suffix stripping is naive: "running" loses only "ing", leaving the
        // doubled consonant ("runn") rather than the true lemma "run".
        assert_eq!(analyzer.lemmatize("running", &POSTag::VerbGerund), "runn");
        assert_eq!(analyzer.lemmatize("cats", &POSTag::NounPlural), "cat");
        assert_eq!(analyzer.lemmatize("jumped", &POSTag::VerbPast), "jump");
    }
684
    #[test]
    fn test_noun_phrase_extraction() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
        let text = "The quick brown fox";

        let tokens = analyzer.pos_tag(text).unwrap();
        let phrases = analyzer.extract_noun_phrases(&tokens).unwrap();

        // Every token is tagged determiner, adjective, or noun, so the whole
        // string accumulates into a single phrase.
        assert_eq!(phrases.len(), 1);
        assert_eq!(phrases[0].text, "The quick brown fox");
    }
696
    #[test]
    fn test_dependency_parsing() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
        // "chased" is suffix-tagged as a past-tense verb and becomes the root.
        let text = "The cat chased the mouse";

        let tokens = analyzer.pos_tag(text).unwrap();
        let deps = analyzer.parse_dependencies(&tokens).unwrap();

        assert!(!deps.is_empty());

        // "cat" precedes the root verb, so a Subject arc must be produced.
        let has_subject = deps
            .iter()
            .any(|d| matches!(d.relation, DependencyRelation::Subject));
        assert!(has_subject, "Should have subject dependency");
    }
714
    #[test]
    fn test_sentence_segmentation() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
        let text = "This is sentence one. This is sentence two! And sentence three?";

        let sentences = analyzer.segment_sentences(text);

        // Splits on ".", "!", "?" followed by whitespace.
        assert_eq!(sentences.len(), 3);
        assert!(sentences[0].contains("sentence one"));
        assert!(sentences[1].contains("sentence two"));
        assert!(sentences[2].contains("sentence three"));
    }
727
728 #[test]
729 fn test_tokenization() {
730 let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
731 let text = "Hello, world!";
732
733 let tokens = analyzer.tokenize(text);
734
735 assert_eq!(tokens.len(), 4); assert_eq!(tokens[0].0, "Hello");
737 assert_eq!(tokens[1].0, ",");
738 }
739
740 #[test]
741 fn test_proper_noun_detection() {
742 let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
743 let text = "John Smith lives in New York";
744
745 let tokens = analyzer.pos_tag(text).unwrap();
746
747 let proper_nouns: Vec<_> = tokens
749 .iter()
750 .filter(|t| matches!(t.pos, POSTag::ProperNoun))
751 .collect();
752
753 assert!(!proper_nouns.is_empty());
754 }
755}