Commit 9983243

Merge pull request #1 from ipa-lab/v2
update main branch
2 parents c54a608 + 2d090b9 commit 9983243

15 files changed

+245
-121
lines changed

‎.env.example‎

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 OPENAI_KEY="your-openai-key"
-MODEL="gpt-3.5-turbo"
+MODEL="gpt-4"
+CONTEXT_SIZE=7000
 
 # exchange with the IP of your target VM
 TARGET_IP='enter-the-private-ip-of-some-vm.local'

‎README.md‎

Lines changed: 20 additions & 4 deletions
@@ -4,7 +4,7 @@
 
 This is a small python script that I use to prototype some potential use-cases when integrating large language models, such as GPT-3, with security-related tasks.
 
-What is it doing? More or less it creates an SSH connection to a configured virtual machine (I am using vulnerable VMs for that on purpose) and then asks GPT-3 to find security vulnerabilities (which it often executes). It evokes a bit of an eerie feeling in me.
+What is it doing? More or less it creates an SSH connection to a configured virtual machine (I am using vulnerable VMs for that on purpose) and then asks LLMs such as GPT-3.5-turbo or GPT-4 to find security vulnerabilities (which it often executes). It evokes a bit of an eerie feeling in me.
 
 ### Vision Paper
 

@@ -29,7 +29,23 @@ series = {ESEC/FSE 2023}
 }
 ~~~
 
-# Example run
+# Example runs
+
+## updated version using GPT-4
+
+This happened during a recent run:
+
+![Example wintermute run](example_run_gpt4.png)
+
+Some things to note:
+
+- the panel labeled 'my new fact list' is generated by the LLM. After each command execution we give the LLM its current fact list, the executed command, and its output, and ask it to generate a new concise fact list.
+- the table contains all executed commands. The columns 'success?' and 'reason' are populated by asking the LLM whether the executed command (and its output) helps with getting root access, and to reason about the command's output.
+- at the bottom you see the last executed command (`/tmp/bash -p`) and its output.
+
+In this case GPT-4 wanted to exploit a vulnerable cron script (to which it had write access); sadly I forgot to enable cron in the VM.
+
+## initial version (tagged as fse23-ivr) using gpt-3.5-turbo
 
 This happened during a recent run:
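The fact-list update described in the first bullet above can be sketched as a small helper. This is only an illustration of the mechanism, not the committed code; `ask_llm` and the prompt wording are hypothetical stand-ins (the real prompts live in `templates/`):

```python
# Sketch of the fact-list cycle: after each command, the LLM gets its
# previous fact list plus the new command/output and returns a condensed
# fact list. `ask_llm` is a hypothetical stand-in for the real LLM call.
def update_fact_list(ask_llm, facts, cmd, output):
    prompt = (
        "Known facts:\n" + "\n".join(facts) + "\n\n"
        + "Executed command: " + cmd + "\n"
        + "Output: " + output + "\n\n"
        + "Return a new concise fact list, one fact per line."
    )
    return ask_llm(prompt).splitlines()

# stubbed LLM for illustration; the fact strings are made up
facts = update_fact_list(
    lambda p: "user www-data has sudo rights\ncron is disabled",
    ["target is Linux"],
    "sudo -l",
    "(ALL) NOPASSWD: ALL",
)
```

Keeping the fact list concise is what bounds prompt growth over long runs.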

@@ -50,9 +66,9 @@ So, what is actually happening when executing wintermute?
 
 ## High-Level Description
 
-This tool uses SSH to connect to a (presumably) vulnerable virtual machine and then asks OpenAI GPT-3 to suggest Linux commands that could be used for finding security vulnerabilities or privilege escalation. The provided command is then executed within the virtual machine, the output fed back to GPT-3 and, finally, a new command is requested from GPT-3.
+This tool uses SSH to connect to a (presumably) vulnerable virtual machine and then asks OpenAI GPT to suggest Linux commands that could be used for finding security vulnerabilities or privilege escalation. The provided command is then executed within the virtual machine, the output fed back to the LLM and, finally, a new command is requested from it.
 
-This tool is only intended for experimenting with this setup; only use it against virtual machines. Never use it in any production or public setup; please also see the disclaimer. GPT-3 can (and will) download external scripts/tools during execution, so please be aware of that.
+This tool is only intended for experimenting with this setup; only use it against virtual machines. Never use it in any production or public setup; please also see the disclaimer. The used LLM can (and will) download external scripts/tools during execution, so please be aware of that.
 
 ## Setup
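The suggest/execute/feed-back loop from the high-level description can be sketched in a few lines. `ask_llm` and `run_on_target` are hypothetical stand-ins for the OpenAI call and the SSH connection; the prompt text is made up:

```python
# Minimal sketch of the loop: ask the LLM for a command, run it on the
# target, feed command + output back into the next prompt.
def escalation_loop(ask_llm, run_on_target, rounds=3):
    history = []
    for _ in range(rounds):
        prompt = "Suggest the next Linux privilege-escalation command.\n"
        prompt += "".join("$ " + c + "\n" + o + "\n" for c, o in history)
        cmd = ask_llm(prompt)
        history.append((cmd, run_on_target(cmd)))
    return history

# stubbed backends for illustration
log = escalation_loop(lambda p: "id", lambda c: "uid=1000(lowpriv)", rounds=2)
```

The real tool additionally truncates the history to fit the model's context window.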

‎config.py‎

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+import os
+
+from dotenv import load_dotenv
+
+def check_config():
+    load_dotenv()
+
+def model():
+    return os.getenv("MODEL")
+
+def context_size():
+    return int(os.getenv("CONTEXT_SIZE"))
+
+def target_ip():
+    return os.getenv('TARGET_IP')
+
+def target_password():
+    return os.getenv("TARGET_PASSWORD")
+
+def target_user():
+    return os.getenv('TARGET_USER')
+
+def openai_key():
+    return os.getenv('OPENAI_KEY')
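One thing to watch in the new `config.py`: `int(os.getenv("CONTEXT_SIZE"))` raises a `TypeError` if the variable is unset. A defensive variant (a sketch, not the committed code; the default value is an assumption) would fall back instead:

```python
import os

# Defensive variant of config.context_size(): return a default instead of
# crashing with TypeError when CONTEXT_SIZE is not set in the environment.
def context_size(default=4096):
    value = os.getenv("CONTEXT_SIZE")
    return int(value) if value is not None else default

os.environ.pop("CONTEXT_SIZE", None)
fallback = context_size()            # unset -> default
os.environ["CONTEXT_SIZE"] = "7000"
configured = context_size()          # set -> parsed value
```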

‎example_run_gpt4.png‎

130 KB

‎history.py‎

Lines changed: 26 additions & 4 deletions
@@ -1,19 +1,27 @@
 import tiktoken
+import os
+
+from rich.table import Table
 
 def num_tokens_from_string(string: str) -> int:
     """Returns the number of tokens in a text string."""
-    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
+    model = os.getenv("MODEL")
+    encoding = tiktoken.encoding_for_model(model)
     return len(encoding.encode(string))
 
 
 class ResultHistory:
     def __init__(self):
         self.data = []
 
-    def append(self, cmd, result):
+    def append(self, think_time, cmd_type, cmd, result, success, reasoning):
         self.data.append({
             "cmd": cmd,
-            "result": result
+            "result": result,
+            "think_time": think_time,
+            "cmd_type": cmd_type,
+            "success": success,
+            "reasoning": reasoning
         })
 
     def get_full_history(self):
@@ -42,4 +50,18 @@ def get_history(self, limit=3072):
                "result" : itm["result"][:(rest-size_cmd-2)] + ".."
            })
                return list(reversed(result))
-        return list(reversed(result))
+        return list(reversed(result))
+
+    def create_history_table(self):
+        table = Table(show_header=True, show_lines=True)
+        table.add_column("Type", style="dim", width=7)
+        table.add_column("ThinkTime", style="dim")
+        table.add_column("To_Execute")
+        table.add_column("Resp. Size", justify="right")
+        table.add_column("success?", width=8)
+        table.add_column("reason")
+
+        for itm in self.data:
+            table.add_row(itm["cmd_type"], itm["think_time"], itm["cmd"], str(len(itm["result"])), itm["success"], itm["reasoning"])
+
+        return table
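The extended `append` signature now carries timing and the LLM's own judgment alongside the command. A self-contained sketch of how a caller feeds it (the record values below are made up for illustration):

```python
# Sketch of feeding the extended ResultHistory after each executed command.
# Field order matches the new signature:
#   append(think_time, cmd_type, cmd, result, success, reasoning)
class ResultHistory:
    def __init__(self):
        self.data = []

    def append(self, think_time, cmd_type, cmd, result, success, reasoning):
        self.data.append({
            "cmd": cmd, "result": result, "think_time": think_time,
            "cmd_type": cmd_type, "success": success, "reasoning": reasoning,
        })

history = ResultHistory()
history.append("1.3", "cmd", "sudo -l", "(ALL) NOPASSWD: ALL",
               "true", "passwordless sudo means root is one command away")
record = history.data[0]
```

These per-record fields are exactly what `create_history_table` renders into the 'success?' and 'reason' columns of the example-run screenshot.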

‎llms/openai.py‎

Lines changed: 7 additions & 15 deletions
@@ -1,20 +1,12 @@
 import openai
-import os
+import config
 
-openapi_model : str = ''
-
-def openai_config():
-    global openapi_model
-
-    api_key = os.getenv('OPENAI_KEY')
-    model = os.getenv('MODEL')
+def get_openai_response(cmd):
 
-    if api_key != '' and model != '':
-        openai.api_key = api_key
-        openapi_model = model
-    else:
+    if config.model() == '' and config.openai_key() == '':
         raise Exception("please set OPENAI_KEY and MODEL through environment variables!")
 
-def get_openai_response(cmd):
-    completion = openai.ChatCompletion.create(model=openapi_model, messages=[{"role": "user", "content" : cmd}])
-    return completion.choices[0].message.content
+    openai.api_key = config.openai_key()
+
+    completion = openai.ChatCompletion.create(model=config.model(), messages=[{"role": "user", "content" : cmd}])
+    return completion.choices[0].message.content

‎llms/openai_rest.py‎

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+import config
+import requests
+
+
+def get_openai_response(cmd):
+    if config.model() == '' and config.openai_key() == '':
+        raise Exception("please set OPENAI_KEY and MODEL through environment variables!")
+    openapi_key = config.openai_key()
+    openapi_model = config.model()
+
+    headers = {"Authorization": f"Bearer {openapi_key}"}
+    data = {'model': openapi_model, 'messages': [{'role': 'user', 'content': cmd}]}
+    response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=data).json()
+
+    print(str(response))
+    return response['choices'][0]['message']['content']
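The REST variant builds the Chat Completions payload by hand instead of going through the `openai` package. The request shape it sends looks like this (a sketch with a placeholder key; no network call is made here):

```python
# Shape of the Chat Completions request that llms/openai_rest.py sends.
# The key value is a placeholder, not a real credential.
openapi_key = "sk-placeholder"
headers = {"Authorization": f"Bearer {openapi_key}"}
data = {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "id"}],
}
# the module then does, roughly:
# requests.post('https://api.openai.com/v1/chat/completions',
#               headers=headers, json=data).json()
```

Dropping the `openai` dependency in favor of plain `requests` matches the slimmed-down requirements.txt in this commit.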

‎prompt_helper.py‎

Lines changed: 20 additions & 17 deletions
@@ -1,26 +1,29 @@
 import logging
+import json
+import time
 
-from colorama import Fore, Style
 from datetime import datetime
 from mako.template import Template
 
-from llms.openai import get_openai_response
+class LLM:
+    def __init__(self, llm_connection):
+        self.connection = llm_connection
 
-log = logging.getLogger()
-filename = datetime.now().strftime('logs/run_%Y%m%d%m-%H%M.log')
-log.addHandler(logging.FileHandler(filename))
+        # prepare logging
+        self.log = logging.getLogger()
+        filename = datetime.now().strftime('logs/run_%Y%m%d%m-%H%M.log')
+        self.log.addHandler(logging.FileHandler(filename))
+        self.get_openai_response = llm_connection
 
-def output_log(kind, msg):
-    print("[" + Fore.RED + kind + Style.RESET_ALL +"]: " + msg)
-    log.warning("[" + kind + "] " + msg)
+    # helper for generating and executing LLM prompts from a template
+    def create_and_ask_prompt(self, template_file, log_prefix, **params):
 
-# helper for generating and executing LLM prompts from a template
-def create_and_ask_prompt(template_file, log_prefix, **params):
-    global logs
+        template = Template(filename='templates/' + template_file)
+        prompt = template.render(**params)
+        self.log.warning("[" + log_prefix + "-prompt] " + prompt)
+        tic = time.perf_counter()
+        result = self.get_openai_response(prompt)
+        toc = time.perf_counter()
+        self.log.warning("[" + log_prefix + "-answer] " + result)
 
-    template = Template(filename='templates/' + template_file)
-    prompt = template.render(**params)
-    output_log(log_prefix + "-prompt", prompt)
-    result = get_openai_response(prompt)
-    output_log(log_prefix + "-answer", result)
-    return result
+        return json.loads(result), str(toc-tic)
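The refactor turns the module-level helpers into an `LLM` class that receives its backend as a constructor argument, so tests can inject a stub instead of calling OpenAI. A sketch of that injection pattern, without the Mako template machinery (class and method names here are illustrative, not the committed ones):

```python
import json

# Sketch of the dependency-injection pattern the new LLM class enables:
# the backend callable is passed in, so a stub can stand in for OpenAI.
class StubbableLLM:
    def __init__(self, llm_connection):
        self.get_openai_response = llm_connection

    def ask(self, prompt):
        result = self.get_openai_response(prompt)
        return json.loads(result)  # the refactored code also json-decodes answers

llm = StubbableLLM(lambda prompt: '{"cmd": "id", "reasoning": "check user"}')
answer = llm.ask("next command?")
```

Note that `json.loads` on the raw answer means the prompt templates must coax the model into emitting valid JSON; a malformed answer raises `json.JSONDecodeError`.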

‎requirements.txt‎

Lines changed: 18 additions & 21 deletions
@@ -1,28 +1,25 @@
-aiohttp==3.8.4
-aiosignal==1.3.1
-async-timeout==4.0.2
-attrs==23.1.0
 bcrypt==4.0.1
-certifi==2022.12.7
+certifi==2023.7.22
 cffi==1.15.1
-charset-normalizer==3.1.0
-colorama==0.4.6
-cryptography==40.0.2
-fabric==3.0.0
-frozenlist==1.3.3
+charset-normalizer==3.2.0
+cryptography==41.0.3
+decorator==5.1.1
+Deprecated==1.2.14
+fabric==3.2.2
 idna==3.4
-invoke==2.0.0
+invoke==2.2.0
 Mako==1.2.4
-MarkupSafe==2.1.2
-multidict==6.0.4
-openai==0.27.4
-paramiko==3.1.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+mdurl==0.1.2
+paramiko==3.3.1
 pycparser==2.21
+Pygments==2.16.1
 PyNaCl==1.5.0
 python-dotenv==1.0.0
-regex==2023.3.23
-requests==2.28.2
-tiktoken==0.3.3
-tqdm==4.65.0
-urllib3==1.26.15
-yarl==1.9.2
+regex==2023.8.8
+requests==2.31.0
+rich==13.5.2
+tiktoken==0.4.0
+urllib3==2.0.4
+wrapt==1.15.0

‎targets/ssh.py‎

Lines changed: 2 additions & 6 deletions
@@ -1,12 +1,7 @@
-import os
-
 from fabric import Connection
 from invoke import Responder
 
-def get_ssh_connection():
-    ip = os.getenv('TARGET_IP')
-    user = os.getenv('TARGET_USER')
-    password = os.getenv('TARGET_PASSWORD')
+def get_ssh_connection(ip, user, password):
 
     if ip != '' and user != '' and password != '':
        return SSHHostConn(ip, user, password)
@@ -31,6 +26,7 @@ def connect(self):
            connect_kwargs={"password": self.password},
        )
        self.conn=conn
+       self.conn.open()
 
    def run(self, cmd):
        sudopass = Responder(
