// browser_agent/openai.rs

1use anyhow::{anyhow, Result};
2use async_openai::{
3    types::{ChatCompletionRequestMessage, CreateChatCompletionRequestArgs, Role},
4    Client,
5};
6use indoc::formatdoc;
7use tracing::debug;
8use url::Url;
9
10use crate::Action;
11
12/// A conversation with GPT-4.
13#[derive(Debug)]
14pub struct Conversation {
15    /// The goal for the agent to achieve.
16    goal: String,
17    /// The client used to communicate with OpenAI.
18    client: Client,
19    /// The URL of the current page.
20    url: Option<Url>,
21    /// A collection of messages sent to GPT-4.
22    messages: Vec<ChatCompletionRequestMessage>,
23}
24
25impl Conversation {
26    /// Create a new conversation with GPT-4.
27    #[must_use]
28    pub fn new(goal: String) -> Self {
29        Self {
30            goal,
31            url: None,
32            client: Client::new(),
33            messages: vec![ChatCompletionRequestMessage {
34                name: None,
35                role: Role::System,
36                content: formatdoc!("
37                    You are an agent controlling a browser. You are given an objective that you are trying to achieve, the URL of the current website, and a simplified markup description of the page contents, which looks like this:
38                    <p id=0>text</p>
39                    <link id=1 href=\"link url\">text</link>
40                    <button id=2>text</button>
41                    <input id=3>placeholder</input>
42                    <img id=4 alt=\"image description\"/>
43
44                    You must respond with ONLY one of the following commands AND NOTHING ELSE:
45                        - CLICK X - click on a given element. You can only click on links, buttons, and inputs!
46                        - TYPE X \"TEXT\" - type the specified text into the input with id X and press ENTER
47                        - ANSWER \"TEXT\" - Respond to the user with the specified text once you have completed the objective
48                "),
49        }]}
50    }
51
52    /// Request and execute an action from GPT-4.
53    #[tracing::instrument]
54    pub async fn request_action(&mut self, url: &str, page_content: &str) -> Result<Action> {
55        self.enforce_context_length(url)?;
56
57        self.messages.push(ChatCompletionRequestMessage {
58            name: None,
59            role: Role::User,
60            content: format!(
61                "OBJECTIVE: {}\nCURRENT URL: {url}\nPAGE CONTENT: {page_content}",
62                self.goal
63            ),
64        });
65
66        let response = self
67            .client
68            .chat()
69            .create(
70                CreateChatCompletionRequestArgs::default()
71                    .model("gpt-4")
72                    .temperature(0.7f32)
73                    .max_tokens(100u16)
74                    .messages(self.messages.clone())
75                    .build()?,
76            )
77            .await?;
78
79        debug!(
80            "Got a response, used {} tokens.",
81            response
82                .usage
83                .expect("Usage should be present.")
84                .total_tokens
85        );
86
87        let message = &response
88            .choices
89            .get(0)
90            .ok_or_else(|| anyhow!("No choices returned from OpenAI.",))?
91            .message;
92
93        self.messages.push(ChatCompletionRequestMessage {
94            name: None,
95            role: message.role.clone(),
96            content: message.content.clone(),
97        });
98
99        message.content.clone().try_into()
100    }
101
102    fn enforce_context_length(&mut self, url: &str) -> Result<()> {
103        let new_url = Url::parse(url)?;
104
105        if self.url.as_ref().map(Url::host) != Some(new_url.host()) {
106            debug!("Host changed, clearing context.");
107            self.messages = self.messages.drain(..1).collect();
108        }
109
110        self.url = Some(new_url);
111        Ok(())
112    }
113}