Skip to content

Commit 3d0ef1c

Browse files
committed
added RAG component
1 parent 0d1544b commit 3d0ef1c

File tree

5 files changed

+106
-4
lines changed

5 files changed

+106
-4
lines changed
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
from .linux import *
1+
from .linux import *
2+
from .rag_utility import *

‎src/hackingBuddyGPT/usecases/rag/common.py‎

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
import datetime
22
import pathlib
33
import re
4+
import os
45

56
from dataclasses import dataclass, field
67
from mako.template import Template
78
from typing import Any, Dict, Optional
9+
from langchain_core.vectorstores import VectorStoreRetriever
810

911
from hackingBuddyGPT.capabilities import Capability
1012
from hackingBuddyGPT.capabilities.capability import capabilities_to_simple_text_handler
1113
from hackingBuddyGPT.usecases.agents import Agent
14+
from hackingBuddyGPT.usecases.rag import rag_utility as rag_util
1215
from hackingBuddyGPT.utils.logging import log_section, log_conversation
1316
from hackingBuddyGPT.utils import llm_util
1417
from hackingBuddyGPT.utils.cli_history import SlidingCliHistory
@@ -18,6 +21,8 @@
1821
template_analyze = Template(filename=str(template_dir / "analyze_cmd.txt"))
1922
template_chain_of_thought = Template(filename=str(template_dir / "chain_of_thought.txt"))
2023
template_structure_guidance = Template(filename=str(template_dir / "structure_guidance.txt"))
24+
template_rag = Template(filename=str(template_dir / "rag_prompt.txt"))
25+
2126

2227
@dataclass
2328
class ThesisPrivescPrototype(Agent):
@@ -28,6 +33,8 @@ class ThesisPrivescPrototype(Agent):
2833
disable_history: bool = False
2934
enable_chain_of_thought: bool = False
3035
enable_structure_guidance: bool = False
36+
enable_rag: bool = False
37+
_rag_document_retriever: VectorStoreRetriever = None
3138
hint: str = ""
3239

3340
_sliding_history: SlidingCliHistory = None
@@ -46,6 +53,9 @@ def before_run(self):
4653
if self.disable_history is False:
4754
self._sliding_history = SlidingCliHistory(self.llm)
4855

56+
if self.enable_rag:
57+
self._rag_document_retriever = rag_util.initiate_rag()
58+
4959
self._template_params = {
5060
"capabilities": self.get_capability_block(),
5161
"system": self.system,
@@ -91,6 +101,13 @@ def perform_round(self, turn: int) -> bool:
91101
else:
92102
self._sliding_history.add_command(cmds, result)
93103

104+
if self.enable_rag:
105+
query = self.get_rag_query(cmds, result)
106+
relevant_documents = self._rag_document_retriever.invoke(query.result)
107+
relevant_information = "".join([d.page_content + "\n" for d in relevant_documents])
108+
self._rag_text = llm_util.trim_result_front(self.llm, int(os.environ['rag_return_token_limit']),
109+
relevant_information)
110+
94111
# analyze the result..
95112
if self.enable_analysis:
96113
self.analyze_result(cmds, result)
@@ -117,6 +134,12 @@ def get_analyze_size(self) -> int:
117134
else:
118135
return 0
119136

137+
def get_rag_size(self) -> int:
    """Return the token footprint of the cached RAG text.

    Used when budgeting the LLM context window; contributes nothing
    when RAG is disabled.
    """
    if not self.enable_rag:
        return 0
    return self.llm.count_tokens(self._rag_text)
142+
120143
@log_conversation("Asking LLM for a new command...", start_section=True)
121144
def get_next_command(self) -> tuple[str, int]:
122145
history = ""
@@ -139,6 +162,18 @@ def get_next_command(self) -> tuple[str, int]:
139162
# return llm_util.cmd_output_fixer(cmd.result), message_id
140163
return cmd.result, message_id
141164

165+
166+
@log_conversation("Asking LLM for a search query...", start_section=True)
def get_rag_query(self, cmd, result):
    """Have the LLM turn the last command and its output into a vector-store search query.

    The command output is trimmed from the front so that the rag_prompt
    template plus the output still fit the model's context window with a
    safety margin.
    """
    budget = (
        self.llm.context_size
        - llm_util.SAFETY_MARGIN
        - self.llm.count_tokens(template_rag.source)
    )
    trimmed_output = llm_util.trim_result_front(self.llm, budget, result)

    response = self.llm.get_response(template_rag, cmd=cmd, resp=trimmed_output)
    self.log.call_response(response)
    return response
176+
142177
@log_section("Executing that command...")
143178
def run_command(self, cmd, message_id) -> tuple[Optional[str], Optional[str], bool]:
144179
_capability_descriptions, parser = capabilities_to_simple_text_handler(self._capabilities, default_capability=self._default_capability)
@@ -170,11 +205,10 @@ def analyze_result(self, cmd, result):
170205
ctx = self.llm.context_size
171206

172207
template_size = self.llm.count_tokens(template_analyze.source)
173-
target_size = ctx - llm_util.SAFETY_MARGIN - template_size # - self.get_rag_size()
208+
target_size = ctx - llm_util.SAFETY_MARGIN - template_size - self.get_rag_size()
174209
result = llm_util.trim_result_front(self.llm, target_size, result)
175210

176-
# result = self.llm.get_response(template_analyze, cmd=cmd, resp=result, rag_enabled=self.enable_rag, rag_text=self._rag_text, hint=self.hint)
177-
result = self.llm.get_response(template_analyze, cmd=cmd, resp=result, hint=self.hint)
211+
result = self.llm.get_response(template_analyze, cmd=cmd, resp=result, rag_enabled=self.enable_rag, rag_text=self._rag_text, hint=self.hint)
178212
self._analyze = result.result
179213
self.log.call_response(result)
180214

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import os
2+
3+
from langchain_community.document_loaders import DirectoryLoader, TextLoader
4+
from dotenv import load_dotenv
5+
from langchain_chroma import Chroma
6+
from langchain_openai import OpenAIEmbeddings
7+
from langchain_text_splitters import MarkdownTextSplitter
8+
9+
10+
def initiate_rag():
    """Open (or build on first use) the RAG vector store and return a retriever.

    Configuration is read from the environment (loaded via .env):
      - rag_database_folder_name: sub-folder holding the Chroma database
      - rag_embedding:            OpenAI embedding model name
      - openai_api_key:           API key used for the embedding calls

    On the first run the markdown knowledge bases (GTFObin and hacktricks)
    are embedded into a new persistent Chroma store; on later runs the
    existing store is simply reopened.

    Returns:
        A similarity-search retriever over the store (top 10 documents).
    """
    load_dotenv()

    # NOTE(review): this path is resolved relative to the current working
    # directory, not relative to this file — confirm callers always run
    # from the expected directory.
    rag_storage_path = os.path.abspath(os.path.join("..", "usecases", "rag", "rag_storage"))
    persistent_directory = os.path.join(rag_storage_path, "vector_storage", os.environ['rag_database_folder_name'])

    embeddings = OpenAIEmbeddings(model=os.environ['rag_embedding'], api_key=os.environ['openai_api_key'])

    if not os.path.exists(persistent_directory):
        # First run: load both markdown corpora and embed them.
        doc_manager_1 = DocumentManager(os.path.join(rag_storage_path, "GTFObinMarkdownFiles"))
        doc_manager_1.load_documents()

        doc_manager_2 = DocumentManager(os.path.join(rag_storage_path, "hacktricksMarkdownFiles"))
        doc_manager_2.load_documents()
        # Hacktricks pages are long, so split them into ~1000-char chunks.
        # GTFObin files are indexed whole — presumably because they are
        # already short; TODO confirm this asymmetry is intentional.
        markdown_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
        documents_hacktricks = markdown_splitter.split_documents(doc_manager_2.documents)

        all_documents = doc_manager_1.documents + documents_hacktricks
        print(f"\n--- Creating vector store in {persistent_directory} ---")
        db = Chroma.from_documents(all_documents, embeddings, persist_directory=persistent_directory)
        print(f"--- Finished creating vector store in {persistent_directory} ---")
    else:
        print(f"Vector store {persistent_directory} already exists. No need to initialize.")
        db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)

    return db.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 10},
    )
43+
44+
class DocumentManager:
    """Loads markdown documents from a directory for RAG ingestion."""

    def __init__(self, directory_path, glob_pattern="./*.md"):
        # Directory to scan and which files to pick up; documents stay
        # empty until load_documents() is called.
        self.directory_path = directory_path
        self.glob_pattern = glob_pattern
        self.documents = []

    def load_documents(self):
        """Populate self.documents with the files matching glob_pattern."""
        self.documents = DirectoryLoader(
            self.directory_path,
            glob=self.glob_pattern,
            show_progress=True,
            loader_cls=TextLoader,
        ).load()
53+

‎src/hackingBuddyGPT/usecases/rag/templates/analyze_cmd.txt‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@ You executed the command '${cmd}' and retrieved the following result:
44
${resp}
55
~~~
66

7+
% if rag_enabled:
8+
You also have the following additional information:
9+
---
10+
${rag_text}
11+
---
12+
%endif
13+
714
Analyze if the output of the executed command allows you to determine a way to escalate your privileges into a root shell. If you find a command that can grant access to a root shell, emphasize it. If you think you found the admin password, advise using test_credential with root and the found password. If you don't think the output has any useful information, just say so and don't over-explain. Be as concise as possible.
815
% if len(hint) != 0:
916
You also have the additional information: ${hint}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
You are a Linux security expert. Your goal is to gain access to a root shell. You executed the command '${cmd}' and retrieved the following result:
2+
3+
~~~ bash
4+
${resp}
5+
~~~
6+
7+
You have access to an external vector store that contains information about Unix binaries and general knowledge about linux privilege escalation attacks. Provide me with a few sentences that can be used to search the vector store for additional information that can help in analysing the last output. Do not add any explanation. Please return full sentences.

0 commit comments

Comments
 (0)