patent 0.3.0

A prior-art search for your code ideas — has this dev tool already been shipped?
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
//! Verdict generation.
//!
//! Builds a prompt from the ranked matches and asks an LLM backend (Ollama or
//! any OpenAI-compatible API) for a scoped verdict. The prompt **forbids claiming
//! non-existence**: results are always phrased as "found in the sources checked",
//! and a clean result means "keep looking before committing", never a green light.

use crate::llm::Llm;
use crate::model::{Match, Query, Saturation, Source, Verdict};

/// The fixed humble caveat shown on every verdict. Never weaken this.
///
/// Both verdict constructors in this module (`parse_verdict` and `from_data`)
/// copy this text verbatim into `Verdict::caveat`, so the wording here is the
/// wording the user sees regardless of what the model returned.
pub const CAVEAT: &str = "Not proof it doesn't exist — only that nothing close turned up \
in the sources checked. Keep looking (web, app stores, niche communities) before committing.";

/// Format the searched sources as a comma-separated list for the prompt.
///
/// Each source is rendered with its `to_string()` form and the pieces are
/// joined with `", "`. When no source is given, a generic description of the
/// registries is returned instead so the prompt sentence still reads naturally.
fn source_list(sources_checked: &[Source]) -> String {
    let rendered: Vec<String> = sources_checked
        .iter()
        .map(|source| source.to_string())
        .collect();

    if rendered.is_empty() {
        "the selected open-source registries".to_string()
    } else {
        rendered.join(", ")
    }
}

/// Assemble the full LLM prompt, including the integrity rules.
///
/// The prompt is made of, in order: a role/coverage preamble naming the
/// sources searched, the user's idea, the ranked matches (or a note that none
/// were found), the non-negotiable rules, the level-selection rubric, and the
/// required JSON output shape.
///
/// `sources_checked` must be the sources that actually responded — the prompt
/// only ever tells the model about coverage that really happened, so the model
/// can't be steered into claiming a source was searched when it wasn't.
///
/// At most the first 15 matches are listed individually; the average
/// similarity is taken over the first 10. Any remainder is summarized as a
/// count.
pub fn build_prompt(query: &Query, matches: &[Match], sources_checked: &[Source]) -> String {
    // How many leading matches feed the average, and how many are listed.
    const AVERAGED: usize = 10;
    const LISTED: usize = 15;

    // Preamble: role, default skepticism, and the real search coverage.
    let mut prompt = format!(
        "You are a skeptical prior-art analyst for SOFTWARE DEVELOPER TOOLS ONLY. Your \
         default assumption is that the idea has already been built — lean toward Crowded \
         or Saturated when in doubt. The user has an idea for a dev tool and we searched \
         these open-source sources for existing implementations: {}.\n\n",
        source_list(sources_checked),
    );

    prompt += &format!("## Idea\n{}\n\n", query.idea);

    if matches.is_empty() {
        prompt += "## Matches\nNo matches were found in the sources checked.\n\n";
    } else {
        // Non-empty here, so the divisor below is at least 1.
        let averaged = &matches[..matches.len().min(AVERAGED)];
        let avg_sim: f32 =
            averaged.iter().map(|m| m.similarity).sum::<f32>() / averaged.len() as f32;

        prompt += "## Matches found (ranked by cosine similarity to the idea)\n";
        prompt += &format!(
            "Top-10 average similarity: {:.2} (scale: 0.0 = unrelated, 0.5 = tangential, \
             0.7+ = strong match)\n\n",
            avg_sim,
        );

        for shown in &matches[..matches.len().min(LISTED)] {
            prompt += &format!(
                "- **{}** ({}, sim {:.2}): {}\n",
                shown.name, shown.source, shown.similarity, shown.description,
            );
        }

        // Summarize whatever did not fit in the itemized list.
        let hidden = matches.len().saturating_sub(LISTED);
        if hidden > 0 {
            prompt += &format!("- … and {} more with lower similarity\n", hidden);
        }
        prompt.push('\n');
    }

    // Integrity rules: existence can be proven, absence never can.
    prompt.push_str(
        "## Rules — you MUST follow these\n\
         - You can prove something EXISTS; you must NEVER claim something does not exist.\n\
         - All conclusions must be scoped to \"found in the sources checked\".\n\
         - Do not say \"this doesn't exist\" or \"there is no prior art\" — only that \
           nothing close turned up in the sources checked.\n\
         - If the idea is NOT about software, developer tools, or programming, respond \
           with level \"Open\" and headline \"This does not appear to be a software tool \
           idea — patent searches developer tool registries only.\"\n\
         - Focus ONLY on matches that directly address the SPECIFIC feature described in \
           the idea. Generic or tangential tools (e.g. a generic linter when the idea is \
           a specific kind of linter) do NOT count as prior art.\n\n",
    );

    // Rubric tying the saturation level to the similarity scores.
    prompt.push_str(
        "## How to choose the level\n\
         Use the similarity scores — they measure how closely each match relates to the idea:\n\
         - **Open**: no match has similarity >= 0.55, OR matches are only tangentially \
           related (they share a category but not the specific feature).\n\
         - **Crowded**: at least 2-3 matches with similarity >= 0.55 that directly \
           address the same problem.\n\
         - **Saturated**: 5+ strong matches (>= 0.60) covering the idea with little room \
           for differentiation.\n\n",
    );

    // Required response shape.
    prompt.push_str(
        "## Output\n\
         Respond with ONLY a JSON object (no markdown fences, no commentary):\n\
         ```\n\
         {\n  \
           \"level\": \"Open\" | \"Crowded\" | \"Saturated\",\n  \
           \"headline\": \"one-sentence summary scoped to sources checked\",\n  \
           \"gaps\": [\"gap the user could fill\", ...]\n\
         }\n\
         ```\n\
         The headline MUST describe the user's idea and its closest matches above \
         — never an unrelated tool from the list — and must be scoped to the \
         sources checked. Never claim the idea does not exist or has no prior art.\n",
    );

    prompt
}

/// Phrases that assert non-existence. The integrity rule forbids ever telling a
/// user their idea doesn't exist (we only searched some sources), so if the
/// model emits one of these despite the prompt, we replace the text.
///
/// This is a deliberately broad, conservative backstop: a false positive only
/// downgrades the copy to a safe scoped headline, whereas a false negative is
/// an integrity violation — so we err toward catching more.
///
/// Entries are lowercase and use the plain ASCII apostrophe; the matcher
/// normalizes its input to the same form before comparing.
const ABSENCE_PHRASES: &[&str] = &[
    "does not exist",
    "doesn't exist",
    "do not exist",
    "don't exist",
    "no prior art",
    "nothing exists",
    "nothing like this",
    "never been built",
    "never been made",
    "never been implemented",
    "has not been built",
    "hasn't been built",
    "has not been implemented",
    "hasn't been implemented",
    "not been implemented",
    "no one has built",
    "no one has made",
    "no one else",
    "nobody else",
    "no one is doing",
    "no such tool",
    "no existing tool",
    "no existing solution",
    "no existing implementation",
    "no similar tool",
    "no similar project",
    "no comparable",
    "no competitors",
    "no alternatives",
    "no equivalent",
    "there is no tool",
    "there are no tools",
    "there is no existing",
    "there is no software",
    "there is no prior",
    "completely novel",
    "entirely new",
    "brand new concept",
    "first of its kind",
    "unprecedented",
];

/// True if `text` asserts that something does not exist.
///
/// Matching is a case-insensitive substring search against
/// [`ABSENCE_PHRASES`]. Typographic apostrophes (`‘`, `’`, `ʼ`, `＇`) are
/// folded to the ASCII `'` first: models frequently emit "doesn’t exist" with
/// a curly apostrophe, which would otherwise slip past every contraction in
/// the list and defeat the guard.
fn contains_absence_phrase(text: &str) -> bool {
    let normalized = text
        .chars()
        .map(|c| match c {
            '\u{2018}' | '\u{2019}' | '\u{02BC}' | '\u{FF07}' => '\'',
            other => other,
        })
        .collect::<String>()
        .to_lowercase();
    ABSENCE_PHRASES.iter().any(|p| normalized.contains(p))
}

/// Phrases claiming nothing was found. Fine when matches are weak, but misleading
/// when a genuinely close match is present, so they are guarded against below.
///
/// Entries are lowercase and use the plain ASCII apostrophe; the matcher
/// normalizes its input to the same form before comparing.
const NO_MATCH_PHRASES: &[&str] = &[
    "no direct match",
    "no close match",
    "no matching",
    "no matches found",
    "no match found",
    "no relevant match",
    "no clear match",
    "no exact match",
    "nothing closely related",
    "no direct prior art",
    "couldn't find any",
    "could not find any",
];

/// True if `text` claims nothing was found.
///
/// Matching is a case-insensitive substring search against
/// [`NO_MATCH_PHRASES`]. Typographic apostrophes (`‘`, `’`, `ʼ`, `＇`) are
/// folded to the ASCII `'` first so that "couldn’t find any" — the form models
/// commonly emit — is caught just like the straight-quote spelling.
fn claims_no_match(text: &str) -> bool {
    let normalized = text
        .chars()
        .map(|c| match c {
            '\u{2018}' | '\u{2019}' | '\u{02BC}' | '\u{FF07}' => '\'',
            other => other,
        })
        .collect::<String>()
        .to_lowercase();
    NO_MATCH_PHRASES.iter().any(|p| normalized.contains(p))
}

/// A safe, scoped headline derived purely from the data — never asserts absence.
///
/// Counts the matches whose similarity is at least 0.55 ("close" matches) and
/// phrases the result for the given `level`. Every variant is scoped to "the
/// sources checked". The noun is singular when exactly one close match exists,
/// for all levels — including `Saturated`, which can be paired with a single
/// close match when the model's own level is kept and only its headline is
/// replaced.
fn data_headline(level: Saturation, matches: &[Match]) -> String {
    let close = matches.iter().filter(|m| m.similarity >= 0.55).count();
    // "tool" vs "tools": only a count of exactly one is singular.
    let plural = if close == 1 { "" } else { "s" };

    match level {
        Saturation::Saturated => format!(
            "Saturated — {close} closely-related tool{plural} turned up in the sources checked."
        ),
        Saturation::Crowded => format!(
            "Crowded — {close} closely-related tool{plural} turned up in the sources checked."
        ),
        // Nothing close: say so without ever implying the idea is novel.
        Saturation::Open if close == 0 => {
            "Nothing close turned up in the sources checked — keep looking before committing."
                .to_string()
        }
        Saturation::Open => format!(
            "{close} closely-related tool{plural} turned up, but the space still looks open in the sources checked. Worth a look before committing."
        ),
    }
}

/// Swap out a headline that asserts non-existence for a data-derived one.
///
/// The prompt asks the model never to claim absence, but that is only a
/// request; this function is the code-level enforcement. A headline that
/// passes the absence check is returned untouched.
fn guard_headline(headline: String, level: Saturation, matches: &[Match]) -> String {
    // Common case: the model behaved, keep its wording.
    if !contains_absence_phrase(&headline) {
        return headline;
    }
    data_headline(level, matches)
}

/// Raise the model's level to at least what the similarity data implies.
///
/// The data-implied level is:
/// - `Saturated` when 5 or more matches have similarity >= 0.60;
/// - otherwise `Crowded` when 2 or more matches have similarity >= 0.55, or a
///   single match reaches 0.70 (one near-identical tool already means the
///   space isn't open);
/// - otherwise `Open`.
///
/// The result is the higher of that and `model_level`, so the model can raise
/// the level above the data but never hand out an "Open" the embeddings
/// contradict.
fn floor_level(model_level: Saturation, matches: &[Match]) -> Saturation {
    // Number of matches at or above a similarity threshold.
    let count_at_least = |threshold: f32| {
        matches
            .iter()
            .filter(|m| m.similarity >= threshold)
            .count()
    };

    let data_level = if count_at_least(0.60) >= 5 {
        Saturation::Saturated
    } else if count_at_least(0.55) >= 2 || count_at_least(0.70) >= 1 {
        Saturation::Crowded
    } else {
        Saturation::Open
    };

    model_level.max(data_level)
}

/// True if `word` appears in `text` as a whole word (not as part of another word).
///
/// An occurrence counts as a whole word when the character immediately before
/// it and the character immediately after it are each either absent (start or
/// end of `text`) or non-alphanumeric. Comparison is exact; callers that want
/// case-insensitive matching must lowercase both arguments first.
///
/// An empty `word` never matches. Every occurrence is examined, including
/// ones that overlap an earlier rejected occurrence, and boundaries are always
/// judged against the full `text` so that the character preceding a later
/// occurrence is not forgotten.
fn is_whole_word(text: &str, word: &str) -> bool {
    // An empty needle is found at every position; without this guard the scan
    // below could never advance past it.
    if word.is_empty() {
        return false;
    }

    let mut search_from = 0;
    while let Some(offset) = text[search_from..].find(word) {
        let start = search_from + offset;
        let end = start + word.len();

        let before_ok = text[..start]
            .chars()
            .next_back()
            .map_or(true, |c| !c.is_alphanumeric());
        let after_ok = text[end..]
            .chars()
            .next()
            .map_or(true, |c| !c.is_alphanumeric());
        if before_ok && after_ok {
            return true;
        }

        // Step one character (not one whole word) past this occurrence so an
        // overlapping occurrence is still considered. `start` is a char
        // boundary and `text[start..]` is non-empty because `word` is.
        search_from = start + text[start..].chars().next().map_or(1, char::len_utf8);
    }
    false
}

/// Extract JSON from a model response that may be wrapped in markdown fences.
///
/// Behaviour, after trimming surrounding whitespace:
/// - No ``` marker: the trimmed text is returned as-is.
/// - A fenced block: the text between the first fence and the next one is
///   returned, trimmed. A leading `json` info string is skipped regardless of
///   letter case (`json`, `JSON`, `Json`).
/// - An opening fence that is never closed (e.g. truncated output): the text
///   after the fence is returned, since the fence itself can never be part of
///   valid JSON. The one exception is a response that already starts with `{`
///   — there a lone ``` is most likely inside a string value, so the whole
///   trimmed text is returned untouched.
///
/// The returned slice borrows from `raw`; no validation of the JSON is done here.
fn extract_json(raw: &str) -> &str {
    let trimmed = raw.trim();

    let after_fence = match trimmed.find("```") {
        Some(start) => &trimmed[start + 3..],
        None => return trimmed,
    };

    // Skip an optional `json` language tag. `get(..4)` yields `None` when the
    // remainder is shorter than four bytes or byte 4 is not a char boundary.
    let body = match after_fence.get(..4) {
        Some(tag) if tag.eq_ignore_ascii_case("json") => &after_fence[4..],
        _ => after_fence,
    }
    .trim_start();

    match body.find("```") {
        Some(end) => body[..end].trim(),
        // Lone marker inside a bare object, or nothing after the fence.
        None if trimmed.starts_with('{') || body.is_empty() => trimmed,
        // Unterminated fence: keep what follows it.
        None => body.trim_end(),
    }
}

/// Parse the model's JSON response into the verdict fields we need, then apply
/// the two integrity guards: floor the level against the similarity data, and
/// replace any headline that asserts non-existence.
fn parse_verdict(
    raw: &str,
    matches: &[Match],
    sources_checked: Vec<Source>,
    sources_failed: Vec<Source>,
) -> crate::Result<Verdict> {
    let json_str = extract_json(raw);

    let v: serde_json::Value =
        serde_json::from_str(json_str).map_err(|e| crate::Error::Parse(e.to_string()))?;

    let model_level = match v["level"].as_str() {
        Some("Open") => Saturation::Open,
        Some("Crowded") => Saturation::Crowded,
        Some("Saturated") => Saturation::Saturated,
        other => return Err(crate::Error::Parse(format!("invalid level: {:?}", other))),
    };

    let raw_headline = v["headline"]
        .as_str()
        .ok_or_else(|| crate::Error::Parse("missing 'headline'".into()))?
        .to_string();

    // Gaps render verbatim, so they get the same absence-claim guard as the
    // headline: a gap that asserts non-existence is dropped rather than shown.
    // Also drop any gap that names a top-10 match — if the model mentions a
    // known tool in a gap it is confirming it exists, not identifying open space.
    let top_names: Vec<String> = matches
        .iter()
        .take(10)
        .map(|m| m.name.to_lowercase())
        .collect();
    let gaps: Vec<String> = match v["gaps"].as_array() {
        Some(arr) => arr
            .iter()
            .filter_map(|g| g.as_str().map(String::from))
            .filter(|g| !contains_absence_phrase(g))
            .filter(|g| {
                let lower = g.to_lowercase();
                !top_names.iter().any(|name| is_whole_word(&lower, name))
            })
            .collect(),
        None => vec![],
    };

    // Floor the level against the data. If that raises it, the model misjudged
    // the space, so we don't trust its headline either — derive a safe one.
    let level = floor_level(model_level, matches);
    let headline = if level != model_level {
        data_headline(level, matches)
    } else {
        raw_headline
    };
    let headline = guard_headline(headline, level, matches);

    // A close match (>= 0.55) is real prior art, so a "found nothing" headline
    // would be misleading even when the level stays Open. Replace it with the
    // data-derived headline, which names the close matches.
    let close = matches.iter().filter(|m| m.similarity >= 0.55).count();
    let headline = if close >= 1 && claims_no_match(&headline) {
        data_headline(level, matches)
    } else {
        headline
    };

    Ok(Verdict {
        level,
        headline,
        gaps,
        sources_checked,
        sources_failed,
        caveat: CAVEAT.to_string(),
    })
}

/// Build a verdict from the similarity data alone, without calling a model.
///
/// This is the `--fast` path (and any caller that deliberately skips the LLM).
/// The level is whatever `floor_level` derives from the embeddings starting
/// from `Open`, and the headline is the same scoped, data-only sentence the
/// flooring guard uses — so a no-LLM run never shows "Open" over a clearly
/// populated space. The fixed [`CAVEAT`] is attached as usual. Gaps need a
/// model to produce, so the list is empty.
pub fn from_data(
    matches: &[Match],
    sources_checked: Vec<Source>,
    sources_failed: Vec<Source>,
) -> Verdict {
    let level = floor_level(Saturation::Open, matches);
    let headline = data_headline(level, matches);

    Verdict {
        level,
        headline,
        gaps: Vec::new(),
        sources_checked,
        sources_failed,
        caveat: CAVEAT.to_string(),
    }
}

/// Produce a [`Verdict`] from ranked matches via an [`Llm`] backend.
///
/// Builds the prompt, asks the backend to generate a response, and parses it
/// with the integrity guards applied. If the first generation attempt fails,
/// the error is discarded and exactly one retry is made after an 800 ms pause;
/// a failure of the retry is returned to the caller.
///
/// # Errors
///
/// Returns the backend's error when both generation attempts fail, or a parse
/// error when the response cannot be turned into a verdict.
pub async fn assess(
    llm: &dyn Llm,
    query: &Query,
    matches: &[Match],
    sources_checked: Vec<Source>,
    sources_failed: Vec<Source>,
) -> crate::Result<Verdict> {
    let prompt = build_prompt(query, matches, &sources_checked);

    let mut response = llm.generate(&prompt).await;
    if response.is_err() {
        // Single best-effort retry after a short back-off.
        tokio::time::sleep(std::time::Duration::from_millis(800)).await;
        response = llm.generate(&prompt).await;
    }

    parse_verdict(&response?, matches, sources_checked, sources_failed)
}