ipa-lab
diff --git a/‎.env.example‎
Lines changed: 6 additions & 0 deletions b/‎.env.example‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 11 additions & 19 deletions b/‎README.md‎
Lines changed: 11 additions & 19 deletions
diff --git a/‎config.py‎
Lines changed: 0 additions & 24 deletions b/‎config.py‎
Lines changed: 0 additions & 24 deletions
diff --git a/‎db_storage.py‎
Lines changed: 87 additions & 0 deletions b/‎db_storage.py‎
Lines changed: 87 additions & 0 deletions
diff --git a/‎handlers.py‎
Lines changed: 27 additions & 0 deletions b/‎handlers.py‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎helper.py‎
Lines changed: 49 additions & 0 deletions b/‎helper.py‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎history.py‎
Lines changed: 0 additions & 67 deletions b/‎history.py‎
Lines changed: 0 additions & 67 deletions
diff --git a/‎history_notes.md‎
Lines changed: 18 additions & 0 deletions b/‎history_notes.md‎
Lines changed: 18 additions & 0 deletions
@@ -8,3 +8,9 @@ TARGET_IP='enter-the-private-ip-of-some-vm.local'
 # exchange with the user for your target VM
 TARGET_USER='bob'
 TARGET_PASSWORD='secret'
+
+# which LLM driver to use (can be openai_rest or oobabooga for now)
+LLM_CONNECTION = "openai_rest"
+
+# how many rounds should this thing go?
+MAX_ROUNDS = 20
@@ -6,6 +6,15 @@ This is a small python script that I use to prototype some potential use-cases w
 
 What is it doing? More or less, it creates an SSH connection to a configured virtual machine (I am using vulnerable VMs for that on purpose) and then asks LLMs such as GPT-3.5-turbo or GPT-4 to find security vulnerabilities (which it then often exploits). This evokes a bit of an eerie feeling for me.
 
+Current features:
+
+- connects over SSH
+- supports multiple OpenAI models (gpt-3.5-turbo, gpt-4, gpt-3.5-turbo-16k, etc.)
+- beautiful console output
+- log storage in sqlite either into a file or in-memory
+- automatic (very rough) root detection
+- can limit rounds (how often the LLM will be asked for a new command)
+
 ### Vision Paper
 
 hackingBuddyGPT is described in the paper [Getting pwn'd by AI: Penetration Testing with Large Language Models ](https://arxiv.org/abs/2308.00121).
@@ -31,6 +40,8 @@ series = {ESEC/FSE 2023}
 
 # Example runs
 
+- more can be seen at [history notes](https://github.com/ipa-lab/hackingBuddyGPT/blob/v3/history_notes.md)
+
 ## updated version using GPT-4
 
 This happened during a recent run:
@@ -45,25 +56,6 @@ Some things to note:
 
 In this case GPT-4 wanted to exploit a vulnerable cron script (to which it had write access), sadly I forgot to enable cron in the VM.
 
-## initial version (tagged as fse23-ivr) using gpt-3.5-turbo
-
-This happened during a recent run:
-
-![Example wintermute run](example_run.png)
-
-Some things to note:
-
-- prompts for GPT-3 are prefixed with `openai-prompt`, the returned command from GPT-3 is prefixed with `openai-next-command` and the result from executing the command with `server-output`
-- the used SSH-library also displays the output produced by the commands executed through SSH --- this is why some stuff appears twice
-- I've added a simple callback that automatically enters the configured account's credentials if sudo prompts for a password
-
-So, what is acutally happening when executing wintermute?
-
-- wintermute executed `id` initially to get the user's id
-- the next command was `sudo -l`, listing the current users sudo permissions
-- wintermute then executes `sudo /bin/bash` and we're dropped into an interactive root shell
-
-
 ## High-Level Description
 
 This tool uses SSH to connect to a (presumably) vulnerable virtual machine and then asks OpenAI GPT to suggest Linux commands that could be used for finding security vulnerabilities or privilege escalation. The provided command is then executed within the virtual machine, the output is fed back to the LLM and, finally, a new command is requested from it.
 
@@ -0,0 +1,87 @@
+import sqlite3
+
+class DbStorage:
+    """SQLite-backed log of runs and of the LLM queries made during each run.
+
+    Usage order: construct, then ``connect()``, then ``setup_db()``; the other
+    methods rely on the cursor and the command ids those two calls create.
+    """
+
+    # Shared by every insert into ``queries`` so the column list lives in one place.
+    _INSERT_QUERY_SQL = "INSERT INTO queries (run_id, round, cmd_id, query, response, duration, tokens_query, tokens_response) VALUES (?, ?, ?, ?, ?, ?, ?, ?)"
+
+    def __init__(self, connection_string=":memory:"):
+        """Remember where the database lives; ``:memory:`` keeps it in RAM only."""
+        self.connection_string = connection_string
+
+    def connect(self):
+        """Open the SQLite connection and create the cursor used by all other methods."""
+        self.db = sqlite3.connect(self.connection_string)
+        self.cursor = self.db.cursor()
+
+    def insert_or_select_cmd(self, name:str) -> int:
+        """Return the id of the command called ``name``, inserting it if missing.
+
+        Returns -1 (after printing a diagnostic) if more than one row matches.
+        """
+        results = self.cursor.execute("SELECT id, name FROM commands WHERE name = ?", (name, )).fetchall()
+
+        if not results:
+            self.cursor.execute("INSERT INTO commands (name) VALUES (?)", (name, ))
+            return self.cursor.lastrowid
+        if len(results) == 1:
+            return results[0][0]
+
+        print("this should not be happening: " + str(results))
+        return -1
+
+    def setup_db(self):
+        """Create the tables if needed and look up the ids of the three known commands."""
+        # create tables
+        self.cursor.execute("CREATE TABLE IF NOT EXISTS runs (id INTEGER PRIMARY KEY, model text, context_size INTEGER, state TEXT, tag TEXT)")
+        self.cursor.execute("CREATE TABLE IF NOT EXISTS commands (id INTEGER PRIMARY KEY, name string unique)")
+        self.cursor.execute("CREATE TABLE IF NOT EXISTS queries (run_id INTEGER, round INTEGER, cmd_id INTEGER, query TEXT, response TEXT, duration REAL, tokens_query INTEGER, tokens_response INTEGER)")
+
+        # insert commands
+        self.query_cmd_id = self.insert_or_select_cmd('query_cmd')
+        self.analyze_response_id = self.insert_or_select_cmd('analyze_response')
+        self.state_update_id = self.insert_or_select_cmd('update_state')
+
+    def create_new_run(self, model, context_size, tag=''):
+        """Insert a new run in state "in progress" and return its row id."""
+        self.cursor.execute("INSERT INTO runs (model, context_size, state, tag) VALUES (?, ?, ?, ?)", (model, context_size, "in progress", tag))
+        return self.cursor.lastrowid
+
+    def _log(self, run_id, round, cmd_id, cmd, result, answer):
+        """Insert one ``queries`` row; a missing ``answer`` is stored as zero duration/tokens."""
+        if answer is None:
+            stats = (0, 0, 0)
+        else:
+            stats = (answer.duration, answer.tokens_query, answer.tokens_response)
+        self.cursor.execute(self._INSERT_QUERY_SQL, (run_id, round, cmd_id, cmd, result, *stats))
+
+    def add_log_query(self, run_id, round, cmd, result, answer):
+        """Log a 'query_cmd' row for the given run and round."""
+        self._log(run_id, round, self.query_cmd_id, cmd, result, answer)
+
+    def add_log_analyze_response(self, run_id, round, cmd, result, answer):
+        """Log an 'analyze_response' row for the given run and round."""
+        self._log(run_id, round, self.analyze_response_id, cmd, result, answer)
+
+    def add_log_update_state(self, run_id, round, cmd, result, answer):
+        """Log an 'update_state' row; ``answer`` may be None."""
+        self._log(run_id, round, self.state_update_id, cmd, result, answer)
+
+    def get_round_data(self, run_id, round):
+        """Summarise one round as seven display strings.
+
+        Returns ``[duration, tokens, cmd, size_resp, analyze_time,
+        analyze_token, reason]``. Fields whose row has not been logged for
+        this round are returned as empty strings instead of raising.
+        """
+        rows = self.cursor.execute("select cmd_id, query, response, duration, tokens_query, tokens_response from queries where run_id = ? and round = ?", (run_id, round)).fetchall()
+
+        # Defaults keep the result well-formed when a round lacks one of the two row kinds.
+        duration = tokens = cmd = size_resp = ""
+        analyze_time = analyze_token = reason = ""
+
+        for cmd_id, query, response, took, tokens_query, tokens_response in rows:
+            if cmd_id == self.query_cmd_id:
+                cmd = query
+                # a NULL response counts as length 0
+                size_resp = str(len(response or ""))
+                duration = f"{took:.4f}"
+                tokens = f"{tokens_query}/{tokens_response}"
+            elif cmd_id == self.analyze_response_id:
+                reason = response
+                analyze_time = f"{took:.4f}"
+                analyze_token = f"{tokens_query}/{tokens_response}"
+
+        return [duration, tokens, cmd, size_resp, analyze_time, analyze_token, reason]
+
+    def get_cmd_history(self, run_id):
+        """Return ``[query, response]`` pairs of all 'query_cmd' rows of a run, oldest round first."""
+        rows = self.cursor.execute("select query, response from queries where run_id = ? and cmd_id = ? order by round asc", (run_id, self.query_cmd_id)).fetchall()
+        return [[query, response] for query, response in rows]
+
+    def run_was_success(self, run_id):
+        """Mark the run as "got root" and commit."""
+        self.cursor.execute("update runs set state=? where id = ?", ("got root", run_id))
+        self.db.commit()
+
+    def run_was_failure(self, run_id):
+        """Mark the run as "reached max runs" and commit."""
+        self.cursor.execute("update runs set state=? where id = ?", ("reached max runs", run_id))
+        self.db.commit()
+
+    def commit(self):
+        """Commit pending changes; the logging methods do not commit on their own."""
+        self.db.commit()
@@ -0,0 +1,27 @@
+import paramiko
+
+from targets.ssh import SSHHostConn
+
+def handle_cmd(conn, input):
+    """Run ``input["cmd"]`` on ``conn`` and return ``(cmd, result, gotRoot)``.
+
+    ``conn.run`` is expected to return a ``(result, gotRoot)`` pair.
+    """
+    command = input["cmd"]
+    output, got_root = conn.run(command)
+    return command, output, got_root
+
+
+def handle_ssh(target_host, input):
+    """Try an SSH login with the given credentials and describe the outcome.
+
+    ``input`` must provide ``"username"`` and ``"password"``. Returns a
+    ``(cmd, message)`` pair, where ``cmd`` is a textual description of the
+    attempt and ``message`` states whether the login worked and whether the
+    account is root.
+    """
+    user = input["username"]
+    password = input["password"]
+
+    cmd = "tried ssh with username " + user + " and password " + password
+
+    test = SSHHostConn(target_host, user, password)
+    try:
+        test.connect()
+        output = test.run("whoami")
+    except paramiko.ssh_exception.AuthenticationException:
+        return cmd, "Authentication error, credentials are wrong"
+
+    # handle_cmd unpacks conn.run() as (result, gotRoot); accept that shape
+    # as well as a bare string so the comparison below can actually match.
+    if isinstance(output, tuple):
+        output = output[0]
+
+    # command output typically ends with a newline, so strip before comparing
+    if str(output).strip() == "root":
+        return cmd, "Login as root was successful"
+    return cmd, "Authentication successful, but user is not root"
@@ -0,0 +1,49 @@
+import tiktoken
+
+from db_storage import DbStorage
+from rich.table import Table
+
+def num_tokens_from_string(model: str, string: str) -> int:
+    """Return the number of tokens in ``string`` for the given model.
+
+    If tiktoken does not know ``model`` the count is taken with the
+    ``cl100k_base`` encoding instead, so it is only an estimate in that case.
+    """
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        # tiktoken raises KeyError for model names it cannot map to an encoding
+        encoding = tiktoken.get_encoding("cl100k_base")
+    return len(encoding.encode(string))
+
+def get_history_table(run_id: int, db: DbStorage, round: int) -> Table:
+    """Build a table with one row per round, from round 0 up to and including ``round``.
+
+    Each row is whatever ``db.get_round_data(run_id, i)`` returns, spread
+    across the seven columns declared below.
+    """
+    # (header, keyword options) for every column, in display order
+    column_specs = (
+        ("ThinkTime", {"style": "dim"}),
+        ("Tokens", {"style": "dim"}),
+        ("Cmd", {}),
+        ("Resp. Size", {"justify": "right"}),
+        ("ThinkTime", {"style": "dim"}),
+        ("Tokens", {"style": "dim"}),
+        ("Reason", {}),
+    )
+
+    table = Table(title="Executed Command History", show_header=True, show_lines=True)
+    for header, options in column_specs:
+        table.add_column(header, **options)
+
+    for round_no in range(round + 1):
+        cells = db.get_round_data(run_id, round_no)
+        table.add_row(*cells)
+
+    return table
+
+def get_cmd_history(model: str, run_id: int, db: DbStorage, limit: int) -> list:
+    """Return the newest part of a run's command history that fits into ``limit`` tokens.
+
+    Entries come from ``db.get_cmd_history(run_id)`` and are returned oldest
+    first. Starting from the newest entry, whole entries are kept while their
+    command plus result still fit into the remaining budget. The first entry
+    that does not fit is added in truncated form if at least 200 tokens
+    remain after its command; everything older is dropped.
+
+    NOTE(review): whole entries are passed through as returned by ``db``,
+    while the truncated entry is a dict with the keys "cmd" and "result".
+    Callers therefore see two entry shapes - confirm which one is expected.
+    """
+    result = []
+    rest = limit
+
+    # walk from the newest entry backwards so recent commands are kept first
+    for itm in reversed(db.get_cmd_history(run_id)):
+        size_cmd = num_tokens_from_string(model, itm[0])
+        size_result = num_tokens_from_string(model, itm[1])
+        size = size_cmd + size_result
+
+        if size > rest:
+            # if theres a bit space left, fill that up with parts of the last item
+            # (the slice is taken in characters although the budget is counted in tokens)
+            if (rest - size_cmd) >= 200:
+                result.append({
+                    "cmd" : itm[0],
+                    "result" : itm[1][:(rest-size_cmd-2)] + ".."
+                })
+            break
+
+        result.append(itm)
+        rest -= size
+
+    return list(reversed(result))
@@ -1,3 +1,21 @@
+# initial version (tagged as fse23-ivr) using gpt-3.5-turbo
+
+This happened during a recent run:
+
+![Example wintermute run](example_run.png)
+
+Some things to note:
+
+- prompts for GPT-3 are prefixed with `openai-prompt`, the returned command from GPT-3 is prefixed with `openai-next-command` and the result from executing the command with `server-output`
+- the used SSH-library also displays the output produced by the commands executed through SSH --- this is why some stuff appears twice
+- I've added a simple callback that automatically enters the configured account's credentials if sudo prompts for a password
+
+So, what is actually happening when executing wintermute?
+
+- wintermute executed `id` initially to get the user's id
+- the next command was `sudo -l`, listing the current user's sudo permissions
+- wintermute then executes `sudo /bin/bash` and we're dropped into an interactive root shell
+
 # initial running version (~0.0.1)
 
 - simple limitation to 3k tokens for history