// perspt_coding/symbols.rs

1//! Symbol extraction for the goal-presence sensor (PSP-8).
2//!
3//! The SDK's [`perspt_sdk::goal`] sensor compares *names*: the symbols a node is
4//! required to produce against the symbols actually defined in the workspace.
5//! Turning a coding task contract into expected names, and source files into
6//! defined names, is domain knowledge — it lives here, not in the kernel.
7//!
8//! The extractors are deliberately lightweight, language-shared scanners over
9//! Rust, Python, and TypeScript declaration keywords plus backtick-quoted
10//! identifiers in a goal description. They are intentionally *conservative*:
11//! they only ever name top-level declarable identifiers, so the goal-presence
12//! sensor never invents an obligation the planner did not express.
13
14use std::collections::BTreeSet;
15
/// Keywords that introduce a named declaration in the supported languages.
///
/// [`defined_symbols`] records the identifier that follows any of these (after
/// optional whitespace) as a symbol defined by the scanned text.
const DECL_KEYWORDS: &[&str] = &[
    // Callables.
    "fn",
    "def",
    "function",
    // Types and type-like declarations.
    "struct",
    "enum",
    "trait",
    "class",
    "interface",
    "type",
];
29
/// Reports whether `c` may open an identifier: an ASCII letter or `_`.
///
/// Digits and all non-ASCII characters are rejected.
fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
34
/// Reports whether `c` may appear after the first character of an identifier:
/// an ASCII letter, an ASCII digit, or `_`.
fn is_ident_continue(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='z' | 'A'..='Z' | '_')
}
39
40/// Extract the identifier that begins at `chars[i..]`, returning it and the
41/// index just past it. Assumes `chars[i]` is an identifier start.
42fn take_ident(chars: &[char], i: usize) -> (String, usize) {
43    let mut j = i;
44    while j < chars.len() && is_ident_continue(chars[j]) {
45        j += 1;
46    }
47    (chars[i..j].iter().collect(), j)
48}
49
50/// Names declared in `source` via a `DECL_KEYWORDS` keyword.
51///
52/// Scans token by token: every time a declaration keyword is seen as a whole
53/// word, the next identifier token is recorded as a defined symbol. This catches
54/// `pub fn multiply(...)`, `def is_even(n):`, `export function f()`,
55/// `struct Foo`, `class Bar`, etc. without a full parser.
56pub fn defined_symbols(source: &str) -> BTreeSet<String> {
57    let chars: Vec<char> = source.chars().collect();
58    let mut out = BTreeSet::new();
59    let mut i = 0;
60    while i < chars.len() {
61        if is_ident_start(chars[i]) {
62            // Word boundary before: previous char must not be ident-continue.
63            let at_boundary = i == 0 || !is_ident_continue(chars[i - 1]);
64            let (word, next) = take_ident(&chars, i);
65            if at_boundary && DECL_KEYWORDS.contains(&word.as_str()) {
66                // Skip whitespace, then take the declared name.
67                let mut k = next;
68                while k < chars.len() && chars[k].is_whitespace() {
69                    k += 1;
70                }
71                if k < chars.len() && is_ident_start(chars[k]) {
72                    let (name, end) = take_ident(&chars, k);
73                    out.insert(name);
74                    i = end;
75                    continue;
76                }
77            }
78            i = next;
79        } else {
80            i += 1;
81        }
82    }
83    out
84}
85
86/// Names the goal requires to exist, drawn from a declared interface signature
87/// and the natural-language goal text.
88///
89/// Two sources, both conservative:
90/// 1. `interface_signature` — the contract's declared public API. Declaration
91///    keywords there name required symbols directly.
92/// 2. `goal` — backtick-quoted identifiers (``` `multiply` ```), and identifiers
93///    immediately followed by `(` (a call/definition shape, e.g.
94///    `` `is_even(n: i32)` ``). Prose words are ignored unless they carry one of
95///    those code shapes, so a chatty goal does not manufacture obligations.
96pub fn expected_symbols(interface_signature: &str, goal: &str) -> Vec<String> {
97    let mut ordered: Vec<String> = Vec::new();
98    let mut seen: BTreeSet<String> = BTreeSet::new();
99
100    let mut push = |name: String| {
101        if name.len() >= 2
102            && is_ident_start(name.chars().next().unwrap())
103            && seen.insert(name.clone())
104        {
105            ordered.push(name);
106        }
107    };
108
109    // 1. Declared interface signature: declaration-keyword names.
110    for name in defined_symbols(interface_signature) {
111        push(name);
112    }
113
114    // 2. Goal text: backtick spans and `ident(` call shapes.
115    for span in backtick_spans(goal) {
116        // Inside a span, capture both declaration-keyword names and `ident(`.
117        for name in defined_symbols(&span) {
118            push(name);
119        }
120        for name in call_shaped_idents(&span) {
121            push(name);
122        }
123        // A span that is *just* a bare identifier (e.g. "implement `lcm`") names
124        // a required symbol directly — unless it is a primitive type or language
125        // keyword that prose routinely quotes (e.g. `i32`, `bool`).
126        let trimmed = span.trim();
127        if is_bare_identifier(trimmed) && !is_primitive_or_keyword(trimmed) {
128            push(trimmed.to_string());
129        }
130    }
131    // Also catch un-quoted `ident(` shapes in the bare goal text.
132    for name in call_shaped_idents(goal) {
133        push(name);
134    }
135
136    ordered
137}
138
139/// True when `s` is a single bare identifier (ident-start then ident-continue,
140/// nothing else) — so `lcm` qualifies but `src/lib.rs`, `i32-based`, and
141/// `fn foo` do not.
142fn is_bare_identifier(s: &str) -> bool {
143    let mut chars = s.chars();
144    match chars.next() {
145        Some(c) if is_ident_start(c) => chars.all(is_ident_continue),
146        _ => false,
147    }
148}
149
/// Reports whether `name` is a primitive type, built-in, or language keyword
/// that prose commonly back-ticks but that is never the *symbol the goal asks
/// to create*.
///
/// The list is intentionally short and spans Rust, Python, and TypeScript so
/// the goal-presence sensor does not manufacture an obligation for a type
/// name. Matching is exact and case-sensitive.
fn is_primitive_or_keyword(name: &str) -> bool {
    matches!(
        name,
        // Rust integer / float / core scalar types.
        "i8" | "i16" | "i32" | "i64" | "i128" | "isize"
            | "u8" | "u16" | "u32" | "u64" | "u128" | "usize"
            | "f32" | "f64"
            | "bool" | "char" | "str"
            | "String" | "Vec" | "Option" | "Result" | "Box" | "Self"
            // Common literals / keywords.
            | "self" | "true" | "false"
            | "None" | "Some" | "Ok" | "Err"
            | "fn" | "def" | "function"
            | "struct" | "enum" | "trait" | "class" | "interface" | "type"
            | "pub" | "let" | "const" | "mut"
            | "async" | "await" | "return"
            | "if" | "else" | "for" | "while" | "match"
            // TypeScript / Python primitives.
            | "number" | "string" | "boolean" | "void" | "any" | "unknown"
            | "int" | "float" | "double" | "long" | "short" | "byte"
            | "object" | "null" | "undefined"
    )
}
228
/// Returns the text enclosed by each pair of backticks in `text`, in order.
///
/// Backticks are paired strictly left to right: the first opens a span, the
/// second closes it, the third opens the next, and so on. A span that is never
/// closed is discarded, and adjacent backticks produce an empty span.
fn backtick_spans(text: &str) -> Vec<String> {
    // Splitting on the delimiter alternates outside / inside / outside / ...
    let pieces: Vec<&str> = text.split('`').collect();
    // After dropping the leading "outside" piece, each complete
    // (inside, outside) pair is one closed span; a trailing unpaired "inside"
    // piece is an unterminated span and is ignored by `chunks_exact`.
    pieces[1..]
        .chunks_exact(2)
        .map(|pair| pair[0].to_owned())
        .collect()
}
245
246/// Identifiers *immediately* followed by `(`, e.g. the `is_even` in
247/// `is_even(n: i32)`. Whitespace before the paren is NOT allowed, so prose like
248/// "Reverse Polish Notation (RPN)" does not misread "Notation" as a call.
249/// Keywords are excluded so control-flow words like `if(` are never symbols.
250fn call_shaped_idents(text: &str) -> Vec<String> {
251    const NOISE: &[&str] = &[
252        "if", "for", "while", "match", "switch", "return", "fn", "def", "function",
253    ];
254    let chars: Vec<char> = text.chars().collect();
255    let mut out = Vec::new();
256    let mut i = 0;
257    while i < chars.len() {
258        if is_ident_start(chars[i]) && (i == 0 || !is_ident_continue(chars[i - 1])) {
259            let (name, next) = take_ident(&chars, i);
260            if next < chars.len() && chars[next] == '(' && !NOISE.contains(&name.as_str()) {
261                out.push(name);
262            }
263            i = next;
264        } else {
265            i += 1;
266        }
267    }
268    out
269}
270
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned name set from string literals.
    fn set(names: &[&str]) -> BTreeSet<String> {
        names.iter().copied().map(String::from).collect()
    }

    /// Expected symbols for a goal with no interface signature.
    fn from_goal(goal: &str) -> Vec<String> {
        expected_symbols("", goal)
    }

    // --- defined_symbols -------------------------------------------------

    #[test]
    fn rust_defined_symbols() {
        let found = defined_symbols(
            "pub fn multiply(a: i32, b: i32) -> i32 { a * b }\nstruct Pair { a: i32 }",
        );
        assert_eq!(found, set(&["multiply", "Pair"]));
    }

    #[test]
    fn python_defined_symbols() {
        let found = defined_symbols("def is_even(n):\n    return n % 2 == 0\nclass Calc:\n    pass");
        assert_eq!(found, set(&["is_even", "Calc"]));
    }

    #[test]
    fn typescript_defined_symbols() {
        let found = defined_symbols(
            "export function add(a: number, b: number) { return a + b }\ninterface Shape {}",
        );
        assert_eq!(found, set(&["add", "Shape"]));
    }

    #[test]
    fn placeholder_file_defines_nothing() {
        assert_eq!(defined_symbols("// implement here\n"), set(&[]));
    }

    #[test]
    fn keyword_substring_is_not_a_declaration() {
        // `define` merely starts with `def`; it is not the `def` keyword.
        assert_eq!(defined_symbols("define_macro_helper = 1"), set(&[]));
    }

    // --- expected_symbols ------------------------------------------------

    #[test]
    fn expected_from_interface_signature() {
        assert_eq!(
            expected_symbols("pub fn is_even(n: i32) -> bool", ""),
            vec!["is_even"]
        );
    }

    #[test]
    fn expected_from_backticked_goal() {
        assert_eq!(
            from_goal("Add a public function `multiply(a: i32, b: i32) -> i32` that returns a*b."),
            vec!["multiply"]
        );
    }

    #[test]
    fn expected_from_call_shape_in_goal() {
        assert_eq!(
            from_goal("Implement is_even(n) returning true for even n."),
            vec!["is_even"]
        );
    }

    #[test]
    fn prose_goal_yields_no_false_obligation() {
        assert!(from_goal("Refactor the module for clarity and improve the docs.").is_empty());
    }

    #[test]
    fn control_flow_words_are_not_symbols() {
        assert!(from_goal("if (x) do something; while (y) loop.").is_empty());
    }

    #[test]
    fn prose_word_before_spaced_paren_is_not_a_symbol() {
        // Regression: a space before '(' marks prose, so "Reverse Polish
        // Notation (RPN)" must not produce "Notation".
        let expected = from_goal("Build a Reverse Polish Notation (RPN) calculator library.");
        assert!(expected.is_empty(), "got {expected:?}");
    }

    #[test]
    fn bare_backtick_identifier_is_expected() {
        // Previously missed: "Implement `lcm`" carries no parentheses.
        assert_eq!(
            from_goal("Implement `lcm` in src/lib.rs with a unit test."),
            vec!["lcm"]
        );
    }

    #[test]
    fn backticked_primitive_type_is_not_an_obligation() {
        // `i32` and `src/lib.rs` are quoted prose, not symbols to create.
        let expected = from_goal("Use an `i32`-based signature; write to `src/lib.rs`.");
        assert!(expected.is_empty(), "got {expected:?}");
    }

    // --- is_bare_identifier ----------------------------------------------

    #[test]
    fn is_bare_identifier_rejects_paths_and_snippets() {
        for accepted in ["lcm", "is_even"] {
            assert!(is_bare_identifier(accepted));
        }
        for rejected in ["src/lib.rs", "fn foo", "a*b", ""] {
            assert!(!is_bare_identifier(rejected));
        }
    }
}