// graphrag_core/nlp/syntax_analyzer.rs
//! Rule-based Syntax Analysis
//!
//! Deterministic POS tagging and dependency parsing without ML models.
//! Lightweight implementation using pattern matching and linguistic rules.
//!
//! Features:
//! - Part-of-Speech (POS) tagging
//! - Dependency parsing (simplified)
//! - Phrase extraction (noun phrases, verb phrases)
//! - Sentence segmentation
//! - Token classification

use std::collections::HashMap;
use std::sync::OnceLock;

use regex::Regex;

use crate::Result;

/// Part-of-Speech tag (coarse-grained, Penn-Treebank-inspired).
///
/// All variants are unit-like, so the enum derives `Copy` and can be
/// passed around without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum POSTag {
    /// Noun (singular or mass)
    Noun,
    /// Noun, plural
    NounPlural,
    /// Proper noun, singular
    ProperNoun,
    /// Proper noun, plural
    ProperNounPlural,
    /// Verb, base form
    Verb,
    /// Verb, past tense
    VerbPast,
    /// Verb, gerund or present participle
    VerbGerund,
    /// Verb, 3rd person singular present
    Verb3rdSing,
    /// Adjective
    Adjective,
    /// Adverb
    Adverb,
    /// Preposition or subordinating conjunction
    Preposition,
    /// Determiner
    Determiner,
    /// Pronoun
    Pronoun,
    /// Conjunction, coordinating
    Conjunction,
    /// Punctuation
    Punctuation,
    /// Number
    Number,
    /// Unknown/Other
    Unknown,
}

impl POSTag {
    /// Get the Penn Treebank tag string for this tag.
    ///
    /// Returns `&'static str` because every tag maps to a string literal;
    /// callers that only needed a borrow tied to `self` are unaffected.
    pub fn penn_tag(&self) -> &'static str {
        match self {
            POSTag::Noun => "NN",
            POSTag::NounPlural => "NNS",
            POSTag::ProperNoun => "NNP",
            POSTag::ProperNounPlural => "NNPS",
            POSTag::Verb => "VB",
            POSTag::VerbPast => "VBD",
            POSTag::VerbGerund => "VBG",
            POSTag::Verb3rdSing => "VBZ",
            POSTag::Adjective => "JJ",
            POSTag::Adverb => "RB",
            POSTag::Preposition => "IN",
            POSTag::Determiner => "DT",
            POSTag::Pronoun => "PRP",
            POSTag::Conjunction => "CC",
            POSTag::Punctuation => ".",
            POSTag::Number => "CD",
            POSTag::Unknown => "UNK",
        }
    }
}

/// Dependency relation type for a head → dependent arc.
///
/// All variants are unit-like, so the enum derives `Copy`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DependencyRelation {
    /// Subject of a verb
    Subject,
    /// Direct object of a verb
    DirectObject,
    /// Indirect object
    IndirectObject,
    /// Modifier (adjective/adverb)
    Modifier,
    /// Determiner
    Determiner,
    /// Prepositional modifier
    PrepositionalModifier,
    /// Conjunction
    Conjunction,
    /// Complement
    Complement,
    /// Root of the sentence
    Root,
    /// Unknown relation
    Unknown,
}

106/// A token with POS tag
107#[derive(Debug, Clone)]
108pub struct Token {
109    /// The text of the token
110    pub text: String,
111    /// Position in the original text
112    pub position: usize,
113    /// POS tag
114    pub pos: POSTag,
115    /// Lemma (base form)
116    pub lemma: String,
117}
118
119/// A dependency arc between tokens
120#[derive(Debug, Clone)]
121pub struct Dependency {
122    /// Index of the head token
123    pub head: usize,
124    /// Index of the dependent token
125    pub dependent: usize,
126    /// Type of dependency relation
127    pub relation: DependencyRelation,
128}
129
130/// A noun phrase
131#[derive(Debug, Clone)]
132pub struct NounPhrase {
133    /// Tokens in the noun phrase
134    pub tokens: Vec<Token>,
135    /// Head noun index within tokens
136    pub head_idx: usize,
137    /// Text span
138    pub text: String,
139}
140
/// Configuration flags for [`SyntaxAnalyzer`].
#[derive(Debug, Clone)]
pub struct SyntaxAnalyzerConfig {
    /// Enable POS tagging.
    pub enable_pos_tagging: bool,
    /// Enable dependency parsing.
    pub enable_dependency_parsing: bool,
    /// Enable phrase extraction.
    pub enable_phrase_extraction: bool,
}

impl Default for SyntaxAnalyzerConfig {
    /// Default configuration: every analysis pass enabled.
    fn default() -> Self {
        Self {
            enable_pos_tagging: true,
            enable_dependency_parsing: true,
            enable_phrase_extraction: true,
        }
    }
}

162/// Rule-based syntax analyzer
163pub struct SyntaxAnalyzer {
164    #[allow(dead_code)] // Reserved for future conditional feature flags
165    config: SyntaxAnalyzerConfig,
166    // Lookup tables for POS tagging
167    common_nouns: HashMap<String, POSTag>,
168    common_verbs: HashMap<String, POSTag>,
169    common_adjectives: HashMap<String, POSTag>,
170    common_adverbs: HashMap<String, POSTag>,
171    prepositions: HashMap<String, POSTag>,
172    determiners: HashMap<String, POSTag>,
173    pronouns: HashMap<String, POSTag>,
174    conjunctions: HashMap<String, POSTag>,
175}
176
impl SyntaxAnalyzer {
    /// Create a new syntax analyzer with all word dictionaries pre-built.
    pub fn new(config: SyntaxAnalyzerConfig) -> Self {
        Self {
            config,
            common_nouns: Self::build_noun_dict(),
            common_verbs: Self::build_verb_dict(),
            common_adjectives: Self::build_adjective_dict(),
            common_adverbs: Self::build_adverb_dict(),
            prepositions: Self::build_preposition_dict(),
            determiners: Self::build_determiner_dict(),
            pronouns: Self::build_pronoun_dict(),
            conjunctions: Self::build_conjunction_dict(),
        }
    }

193    /// Tokenize text into words
194    fn tokenize(&self, text: &str) -> Vec<(String, usize)> {
195        let mut tokens = Vec::new();
196        let mut current_word = String::new();
197        let mut word_start = 0;
198
199        for (i, ch) in text.chars().enumerate() {
200            if ch.is_alphanumeric() || ch == '\'' || ch == '-' {
201                if current_word.is_empty() {
202                    word_start = i;
203                }
204                current_word.push(ch);
205            } else {
206                if !current_word.is_empty() {
207                    tokens.push((current_word.clone(), word_start));
208                    current_word.clear();
209                }
210                // Add punctuation as separate tokens
211                if !ch.is_whitespace() {
212                    tokens.push((ch.to_string(), i));
213                }
214            }
215        }
216
217        if !current_word.is_empty() {
218            tokens.push((current_word, word_start));
219        }
220
221        tokens
222    }
223
224    /// Perform POS tagging on text
225    pub fn pos_tag(&self, text: &str) -> Result<Vec<Token>> {
226        let raw_tokens = self.tokenize(text);
227        let mut tokens = Vec::new();
228
229        for (word, position) in raw_tokens {
230            let pos = self.tag_word(&word);
231            let lemma = self.lemmatize(&word, &pos);
232
233            tokens.push(Token {
234                text: word,
235                position,
236                pos,
237                lemma,
238            });
239        }
240
241        Ok(tokens)
242    }
243
244    /// Tag a single word with POS
245    fn tag_word(&self, word: &str) -> POSTag {
246        let lower = word.to_lowercase();
247
248        // Check punctuation
249        if word.chars().all(|c| c.is_ascii_punctuation()) {
250            return POSTag::Punctuation;
251        }
252
253        // Check numbers
254        if word.chars().all(|c| c.is_ascii_digit()) {
255            return POSTag::Number;
256        }
257
258        // Check dictionaries
259        if let Some(pos) = self.determiners.get(&lower) {
260            return pos.clone();
261        }
262        if let Some(pos) = self.pronouns.get(&lower) {
263            return pos.clone();
264        }
265        if let Some(pos) = self.prepositions.get(&lower) {
266            return pos.clone();
267        }
268        if let Some(pos) = self.conjunctions.get(&lower) {
269            return pos.clone();
270        }
271        if let Some(pos) = self.common_adverbs.get(&lower) {
272            return pos.clone();
273        }
274        if let Some(pos) = self.common_verbs.get(&lower) {
275            return pos.clone();
276        }
277        if let Some(pos) = self.common_adjectives.get(&lower) {
278            return pos.clone();
279        }
280        if let Some(pos) = self.common_nouns.get(&lower) {
281            return pos.clone();
282        }
283
284        // Pattern-based tagging
285        // Proper noun: capitalized and not at start of sentence
286        if word.chars().next().unwrap().is_uppercase() {
287            return POSTag::ProperNoun;
288        }
289
290        // Verb patterns
291        if lower.ends_with("ing") {
292            return POSTag::VerbGerund;
293        }
294        if lower.ends_with("ed") {
295            return POSTag::VerbPast;
296        }
297
298        // Noun patterns (plural)
299        if lower.ends_with('s') && !lower.ends_with("ss") {
300            return POSTag::NounPlural;
301        }
302
303        // Adjective patterns
304        if lower.ends_with("ive") || lower.ends_with("ous") || lower.ends_with("ful") {
305            return POSTag::Adjective;
306        }
307
308        // Adverb patterns
309        if lower.ends_with("ly") {
310            return POSTag::Adverb;
311        }
312
313        // Default to noun
314        POSTag::Noun
315    }
316
317    /// Simple lemmatization
318    fn lemmatize(&self, word: &str, pos: &POSTag) -> String {
319        let lower = word.to_lowercase();
320
321        match pos {
322            POSTag::NounPlural => {
323                // Remove plural 's'
324                if lower.ends_with("ies") {
325                    return format!("{}y", &lower[..lower.len() - 3]);
326                }
327                if lower.ends_with('s') && !lower.ends_with("ss") {
328                    return lower[..lower.len() - 1].to_string();
329                }
330                lower
331            }
332            POSTag::VerbPast | POSTag::Verb3rdSing => {
333                // Remove -ed, -s
334                if lower.ends_with("ed") {
335                    return lower[..lower.len() - 2].to_string();
336                }
337                if lower.ends_with('s') {
338                    return lower[..lower.len() - 1].to_string();
339                }
340                lower
341            }
342            POSTag::VerbGerund => {
343                // Remove -ing
344                if lower.ends_with("ing") {
345                    return lower[..lower.len() - 3].to_string();
346                }
347                lower
348            }
349            _ => lower,
350        }
351    }
352
353    /// Parse dependencies (simplified)
354    pub fn parse_dependencies(&self, tokens: &[Token]) -> Result<Vec<Dependency>> {
355        let mut dependencies = Vec::new();
356
357        if tokens.is_empty() {
358            return Ok(dependencies);
359        }
360
361        // Find the main verb (root)
362        let root_idx = tokens
363            .iter()
364            .position(|t| matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing))
365            .unwrap_or(0);
366
367        // Find subject (noun/pronoun before verb)
368        for i in 0..root_idx {
369            if matches!(
370                tokens[i].pos,
371                POSTag::Noun | POSTag::ProperNoun | POSTag::Pronoun
372            ) {
373                dependencies.push(Dependency {
374                    head: root_idx,
375                    dependent: i,
376                    relation: DependencyRelation::Subject,
377                });
378                break;
379            }
380        }
381
382        // Find object (noun after verb)
383        for i in (root_idx + 1)..tokens.len() {
384            if matches!(tokens[i].pos, POSTag::Noun | POSTag::ProperNoun) {
385                dependencies.push(Dependency {
386                    head: root_idx,
387                    dependent: i,
388                    relation: DependencyRelation::DirectObject,
389                });
390                break;
391            }
392        }
393
394        // Find modifiers (adjectives before nouns, adverbs near verbs)
395        for i in 0..tokens.len() {
396            match tokens[i].pos {
397                POSTag::Adjective => {
398                    // Find next noun
399                    if let Some(noun_idx) =
400                        tokens[i + 1..].iter().position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
401                    {
402                        dependencies.push(Dependency {
403                            head: i + 1 + noun_idx,
404                            dependent: i,
405                            relation: DependencyRelation::Modifier,
406                        });
407                    }
408                }
409                POSTag::Adverb => {
410                    // Modify nearest verb
411                    let verb_idx = tokens.iter().position(|t| {
412                        matches!(t.pos, POSTag::Verb | POSTag::VerbPast | POSTag::Verb3rdSing)
413                    });
414                    if let Some(v_idx) = verb_idx {
415                        dependencies.push(Dependency {
416                            head: v_idx,
417                            dependent: i,
418                            relation: DependencyRelation::Modifier,
419                        });
420                    }
421                }
422                POSTag::Determiner => {
423                    // Determine next noun
424                    if let Some(noun_idx) =
425                        tokens[i + 1..].iter().position(|t| matches!(t.pos, POSTag::Noun | POSTag::ProperNoun))
426                    {
427                        dependencies.push(Dependency {
428                            head: i + 1 + noun_idx,
429                            dependent: i,
430                            relation: DependencyRelation::Determiner,
431                        });
432                    }
433                }
434                _ => {}
435            }
436        }
437
438        Ok(dependencies)
439    }
440
441    /// Extract noun phrases
442    pub fn extract_noun_phrases(&self, tokens: &[Token]) -> Result<Vec<NounPhrase>> {
443        let mut phrases = Vec::new();
444        let mut current_phrase: Vec<Token> = Vec::new();
445        let mut head_idx = 0;
446
447        for token in tokens {
448            match token.pos {
449                POSTag::Determiner | POSTag::Adjective => {
450                    // Start or continue noun phrase
451                    current_phrase.push(token.clone());
452                }
453                POSTag::Noun | POSTag::ProperNoun | POSTag::NounPlural | POSTag::ProperNounPlural => {
454                    // Add noun to phrase
455                    head_idx = current_phrase.len();
456                    current_phrase.push(token.clone());
457                }
458                _ => {
459                    // End of noun phrase
460                    if !current_phrase.is_empty() {
461                        let text = current_phrase
462                            .iter()
463                            .map(|t| t.text.as_str())
464                            .collect::<Vec<_>>()
465                            .join(" ");
466
467                        phrases.push(NounPhrase {
468                            tokens: current_phrase.clone(),
469                            head_idx,
470                            text,
471                        });
472
473                        current_phrase.clear();
474                        head_idx = 0;
475                    }
476                }
477            }
478        }
479
480        // Add final phrase if exists
481        if !current_phrase.is_empty() {
482            let text = current_phrase
483                .iter()
484                .map(|t| t.text.as_str())
485                .collect::<Vec<_>>()
486                .join(" ");
487
488            phrases.push(NounPhrase {
489                tokens: current_phrase,
490                head_idx,
491                text,
492            });
493        }
494
495        Ok(phrases)
496    }
497
498    /// Segment text into sentences
499    pub fn segment_sentences(&self, text: &str) -> Vec<String> {
500        let sentence_regex = Regex::new(r"[.!?]+\s+").unwrap();
501        sentence_regex
502            .split(text)
503            .map(|s| s.trim().to_string())
504            .filter(|s| !s.is_empty())
505            .collect()
506    }
507
508    // Dictionary builders
509    fn build_noun_dict() -> HashMap<String, POSTag> {
510        let nouns = vec![
511            "time", "person", "year", "way", "day", "thing", "man", "world", "life",
512            "hand", "part", "child", "eye", "woman", "place", "work", "week", "case",
513            "point", "government", "company", "number", "group", "problem", "fact",
514        ];
515        nouns.into_iter().map(|s| (s.to_string(), POSTag::Noun)).collect()
516    }
517
518    fn build_verb_dict() -> HashMap<String, POSTag> {
519        let verbs = vec![
520            "be", "have", "do", "say", "get", "make", "go", "know", "take", "see",
521            "come", "think", "look", "want", "give", "use", "find", "tell", "ask",
522            "work", "seem", "feel", "try", "leave", "call",
523        ];
524        verbs.into_iter().map(|s| (s.to_string(), POSTag::Verb)).collect()
525    }
526
527    fn build_adjective_dict() -> HashMap<String, POSTag> {
528        let adjectives = vec![
529            "good", "new", "first", "last", "long", "great", "little", "own", "other",
530            "old", "right", "big", "high", "different", "small", "large", "next",
531            "early", "young", "important", "few", "public", "bad", "same", "able",
532        ];
533        adjectives.into_iter().map(|s| (s.to_string(), POSTag::Adjective)).collect()
534    }
535
536    fn build_adverb_dict() -> HashMap<String, POSTag> {
537        let adverbs = vec![
538            "not", "so", "out", "up", "now", "only", "just", "more", "also", "very",
539            "well", "back", "there", "even", "still", "too", "here", "then", "always",
540            "never", "often", "quite", "really", "almost", "again",
541        ];
542        adverbs.into_iter().map(|s| (s.to_string(), POSTag::Adverb)).collect()
543    }
544
545    fn build_preposition_dict() -> HashMap<String, POSTag> {
546        let prepositions = vec![
547            "of", "in", "to", "for", "with", "on", "at", "from", "by", "about",
548            "into", "through", "during", "before", "after", "above", "below", "between",
549            "under", "since", "without", "within", "along", "among", "across",
550        ];
551        prepositions.into_iter().map(|s| (s.to_string(), POSTag::Preposition)).collect()
552    }
553
554    fn build_determiner_dict() -> HashMap<String, POSTag> {
555        let determiners = vec![
556            "the", "a", "an", "this", "that", "these", "those", "my", "your",
557            "his", "her", "its", "our", "their", "all", "both", "each", "every",
558            "some", "any", "no", "another", "such", "what", "which",
559        ];
560        determiners.into_iter().map(|s| (s.to_string(), POSTag::Determiner)).collect()
561    }
562
563    fn build_pronoun_dict() -> HashMap<String, POSTag> {
564        let pronouns = vec![
565            "i", "you", "he", "she", "it", "we", "they", "me", "him", "her",
566            "us", "them", "who", "whom", "what", "which", "this", "that",
567        ];
568        pronouns.into_iter().map(|s| (s.to_string(), POSTag::Pronoun)).collect()
569    }
570
571    fn build_conjunction_dict() -> HashMap<String, POSTag> {
572        let conjunctions = vec![
573            "and", "or", "but", "nor", "yet", "so", "for", "because", "although",
574            "though", "while", "if", "unless", "until", "when", "where",
575        ];
576        conjunctions.into_iter().map(|s| (s.to_string(), POSTag::Conjunction)).collect()
577    }
578}
579
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pos_tagging() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());
        let tokens = analyzer
            .pos_tag("The good brown fox jumps over the lazy dog.")
            .unwrap();

        assert!(!tokens.is_empty());

        // "The" is in the determiner dictionary, "good" in the adjectives.
        assert_eq!(tokens[0].pos, POSTag::Determiner);
        assert_eq!(tokens[1].pos, POSTag::Adjective);
        // "fox" is open-class; either noun tag is acceptable.
        assert!(matches!(tokens[3].pos, POSTag::Noun | POSTag::ProperNoun));
        // "jumps" ends with 's' and may be tagged as a plural noun, so we
        // only check that the token survived tokenization.
        assert!(tokens.iter().any(|t| t.text == "jumps"));
    }

    #[test]
    fn test_lemmatization() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        // Documents the naive suffix stripper: doubled consonants are
        // not un-doubled ("running" -> "runn", not "run").
        assert_eq!(analyzer.lemmatize("running", &POSTag::VerbGerund), "runn");
        assert_eq!(analyzer.lemmatize("cats", &POSTag::NounPlural), "cat");
        assert_eq!(analyzer.lemmatize("jumped", &POSTag::VerbPast), "jump");
    }

    #[test]
    fn test_noun_phrase_extraction() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let tokens = analyzer.pos_tag("The quick brown fox").unwrap();
        let phrases = analyzer.extract_noun_phrases(&tokens).unwrap();

        assert_eq!(phrases.len(), 1);
        assert_eq!(phrases[0].text, "The quick brown fox");
    }

    #[test]
    fn test_dependency_parsing() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let tokens = analyzer.pos_tag("The cat chased the mouse").unwrap();
        let deps = analyzer.parse_dependencies(&tokens).unwrap();

        // Expect at least subject and object arcs.
        assert!(!deps.is_empty());
        let has_subject = deps
            .iter()
            .any(|d| matches!(d.relation, DependencyRelation::Subject));
        assert!(has_subject, "Should have subject dependency");
    }

    #[test]
    fn test_sentence_segmentation() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let sentences = analyzer
            .segment_sentences("This is sentence one. This is sentence two! And sentence three?");

        assert_eq!(sentences.len(), 3);
        assert!(sentences[0].contains("sentence one"));
        assert!(sentences[1].contains("sentence two"));
        assert!(sentences[2].contains("sentence three"));
    }

    #[test]
    fn test_tokenization() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let tokens = analyzer.tokenize("Hello, world!");

        // Expected tokens: "Hello", ",", "world", "!"
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].0, "Hello");
        assert_eq!(tokens[1].0, ",");
    }

    #[test]
    fn test_proper_noun_detection() {
        let analyzer = SyntaxAnalyzer::new(SyntaxAnalyzerConfig::default());

        let tokens = analyzer.pos_tag("John Smith lives in New York").unwrap();

        // Capitalized out-of-dictionary words should be tagged ProperNoun.
        let proper_nouns: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.pos, POSTag::ProperNoun))
            .collect();

        assert!(!proper_nouns.is_empty());
    }
}