Skip to content

Commit 3d0ef1c

Browse files
committed
added RAG component
1 parent 0d1544b commit 3d0ef1c

File tree

5 files changed

+106
-4
lines changed

5 files changed

+106
-4
lines changed
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
from .linux import *
1+
from .linux import *
2+
from .rag_utility import *

‎src/hackingBuddyGPT/usecases/rag/common.py‎

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
import datetime
22
import pathlib
33
import re
4+
import os
45

56
from dataclasses import dataclass, field
67
from mako.template import Template
78
from typing import Any, Dict, Optional
9+
from langchain_core.vectorstores import VectorStoreRetriever
810

911
from hackingBuddyGPT.capabilities import Capability
1012
from hackingBuddyGPT.capabilities.capability import capabilities_to_simple_text_handler
1113
from hackingBuddyGPT.usecases.agents import Agent
14+
from hackingBuddyGPT.usecases.rag import rag_utility as rag_util
1215
from hackingBuddyGPT.utils.logging import log_section, log_conversation
1316
from hackingBuddyGPT.utils import llm_util
1417
from hackingBuddyGPT.utils.cli_history import SlidingCliHistory
@@ -18,6 +21,8 @@
1821
template_analyze = Template(filename=str(template_dir / "analyze_cmd.txt"))
1922
template_chain_of_thought = Template(filename=str(template_dir / "chain_of_thought.txt"))
2023
template_structure_guidance = Template(filename=str(template_dir / "structure_guidance.txt"))
24+
template_rag = Template(filename=str(template_dir / "rag_prompt.txt"))
25+
2126

2227
@dataclass
2328
class ThesisPrivescPrototype(Agent):
@@ -28,6 +33,8 @@ class ThesisPrivescPrototype(Agent):
2833
disable_history: bool = False
2934
enable_chain_of_thought: bool = False
3035
enable_structure_guidance: bool = False
36+
enable_rag: bool = False
37+
_rag_document_retriever: VectorStoreRetriever = None
3138
hint: str = ""
3239

3340
_sliding_history: SlidingCliHistory = None
@@ -46,6 +53,9 @@ def before_run(self):
4653
if self.disable_history is False:
4754
self._sliding_history = SlidingCliHistory(self.llm)
4855

56+
if self.enable_rag:
57+
self._rag_document_retriever = rag_util.initiate_rag()
58+
4959
self._template_params = {
5060
"capabilities": self.get_capability_block(),
5161
"system": self.system,
@@ -91,6 +101,13 @@ def perform_round(self, turn: int) -> bool:
91101
else:
92102
self._sliding_history.add_command(cmds, result)
93103

104+
if self.enable_rag:
105+
query = self.get_rag_query(cmds, result)
106+
relevant_documents = self._rag_document_retriever.invoke(query.result)
107+
relevant_information = "".join([d.page_content + "\n" for d in relevant_documents])
108+
self._rag_text = llm_util.trim_result_front(self.llm, int(os.environ['rag_return_token_limit']),
109+
relevant_information)
110+
94111
# analyze the result..
95112
if self.enable_analysis:
96113
self.analyze_result(cmds, result)
@@ -117,6 +134,12 @@ def get_analyze_size(self) -> int:
117134
else:
118135
return 0
119136

137+
def get_rag_size(self) -> int:
    """Return the token footprint of the cached RAG text.

    Used when budgeting the LLM context window; contributes nothing
    when RAG is disabled.
    """
    if not self.enable_rag:
        return 0
    return self.llm.count_tokens(self._rag_text)
142+
120143
@log_conversation("Asking LLM for a new command...", start_section=True)
121144
def get_next_command(self) -> tuple[str, int]:
122145
history = ""
@@ -139,6 +162,18 @@ def get_next_command(self) -> tuple[str, int]:
139162
# return llm_util.cmd_output_fixer(cmd.result), message_id
140163
return cmd.result, message_id
141164

165+
166+
@log_conversation("Asking LLM for a search query...", start_section=True)
def get_rag_query(self, cmd, result):
    """Have the LLM turn the last command and its output into a vector-store search query.

    The command output is trimmed from the front so that the rag_prompt
    template plus the output still fit the model's context window with a
    safety margin.
    """
    budget = (
        self.llm.context_size
        - llm_util.SAFETY_MARGIN
        - self.llm.count_tokens(template_rag.source)
    )
    trimmed_output = llm_util.trim_result_front(self.llm, budget, result)

    response = self.llm.get_response(template_rag, cmd=cmd, resp=trimmed_output)
    self.log.call_response(response)
    return response
176+
142177
@log_section("Executing that command...")
143178
def run_command(self, cmd, message_id) -> tuple[Optional[str], Optional[str], bool]:
144179
_capability_descriptions, parser = capabilities_to_simple_text_handler(self._capabilities, default_capability=self._default_capability)
@@ -170,11 +205,10 @@ def analyze_result(self, cmd, result):
170205
ctx = self.llm.context_size
171206

172207
template_size = self.llm.count_tokens(template_analyze.source)
173-
target_size = ctx - llm_util.SAFETY_MARGIN - template_size # - self.get_rag_size()
208+
target_size = ctx - llm_util.SAFETY_MARGIN - template_size - self.get_rag_size()
174209
result = llm_util.trim_result_front(self.llm, target_size, result)
175210

176-
# result = self.llm.get_response(template_analyze, cmd=cmd, resp=result, rag_enabled=self.enable_rag, rag_text=self._rag_text, hint=self.hint)
177-
result = self.llm.get_response(template_analyze, cmd=cmd, resp=result, hint=self.hint)
211+
result = self.llm.get_response(template_analyze, cmd=cmd, resp=result, rag_enabled=self.enable_rag, rag_text=self._rag_text, hint=self.hint)
178212
self._analyze = result.result
179213
self.log.call_response(result)
180214

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import os
2+
3+
from langchain_community.document_loaders import DirectoryLoader, TextLoader
4+
from dotenv import load_dotenv
5+
from langchain_chroma import Chroma
6+
from langchain_openai import OpenAIEmbeddings
7+
from langchain_text_splitters import MarkdownTextSplitter
8+
9+
10+
def initiate_rag():
    """Open (or build on first use) the RAG vector store and return a retriever.

    Configuration is read from the environment (loaded via .env):
      - rag_database_folder_name: sub-folder holding the Chroma database
      - rag_embedding:            OpenAI embedding model name
      - openai_api_key:           API key used for the embedding calls

    On the first run the markdown knowledge bases (GTFObin and hacktricks)
    are embedded into a new persistent Chroma store; on later runs the
    existing store is simply reopened.

    Returns:
        A similarity-search retriever over the store (top 10 documents).
    """
    load_dotenv()

    # NOTE(review): this path is resolved relative to the current working
    # directory, not relative to this file — confirm callers always run
    # from the expected directory.
    rag_storage_path = os.path.abspath(os.path.join("..", "usecases", "rag", "rag_storage"))
    persistent_directory = os.path.join(rag_storage_path, "vector_storage", os.environ['rag_database_folder_name'])

    embeddings = OpenAIEmbeddings(model=os.environ['rag_embedding'], api_key=os.environ['openai_api_key'])

    if not os.path.exists(persistent_directory):
        # First run: load both markdown corpora and embed them.
        doc_manager_1 = DocumentManager(os.path.join(rag_storage_path, "GTFObinMarkdownFiles"))
        doc_manager_1.load_documents()

        doc_manager_2 = DocumentManager(os.path.join(rag_storage_path, "hacktricksMarkdownFiles"))
        doc_manager_2.load_documents()
        # Hacktricks pages are long, so split them into ~1000-char chunks.
        # GTFObin files are indexed whole — presumably because they are
        # already short; TODO confirm this asymmetry is intentional.
        markdown_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
        documents_hacktricks = markdown_splitter.split_documents(doc_manager_2.documents)

        all_documents = doc_manager_1.documents + documents_hacktricks
        print(f"\n--- Creating vector store in {persistent_directory} ---")
        db = Chroma.from_documents(all_documents, embeddings, persist_directory=persistent_directory)
        print(f"--- Finished creating vector store in {persistent_directory} ---")
    else:
        print(f"Vector store {persistent_directory} already exists. No need to initialize.")
        db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

    return db.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 10},
    )
43+
44+
class DocumentManager:
    """Loads markdown documents from a directory for RAG ingestion."""

    def __init__(self, directory_path, glob_pattern="./*.md"):
        # Directory to scan and which files to pick up; documents stay
        # empty until load_documents() is called.
        self.directory_path = directory_path
        self.glob_pattern = glob_pattern
        self.documents = []

    def load_documents(self):
        """Populate self.documents with the files matching glob_pattern."""
        self.documents = DirectoryLoader(
            self.directory_path,
            glob=self.glob_pattern,
            show_progress=True,
            loader_cls=TextLoader,
        ).load()
53+

‎src/hackingBuddyGPT/usecases/rag/templates/analyze_cmd.txt‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@ You executed the command '${cmd}' and retrieved the following result:
44
${resp}
55
~~~
66

7+
% if rag_enabled:
8+
You also have the following additional information:
9+
---
10+
${rag_text}
11+
---
12+
%endif
13+
714
Analyze if the output of the executed command allows you to determine a way to escalate your privileges into a root shell. If you find a command that can grant access to a root shell, emphasize it. If you think you found the admin password, advise using test_credential with root and the found password. If you don't think the output has any useful information, just say so and don't over-explain. Be as concise as possible.
815
% if len(hint) != 0:
916
You also have the additional information: ${hint}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
You are a Linux security expert. Your goal is to gain access to a root shell. You executed the command '${cmd}' and retrieved the following result:
2+
3+
~~~ bash
4+
${resp}
5+
~~~
6+
7+
You have access to an external vector store that contains information about Unix binaries and general knowledge about linux privilege escalation attacks. Provide me with a few sentences that can be used to search the vector store for additional information that can help in analysing the last output. Do not add any explanation. Please return full sentences.

0 commit comments

Comments
 (0)