Merge pull request ipa-lab#76 from ipa-lab/development_without_spacy

andreashappe · web-flow · commit 38bfbc0bd81d · 2024-08-05T14:27:37.000+02:00
Development without spacy
diff --git a/pyproject.toml b/pyproject.toml
@@ -35,8 +35,8 @@ dependencies = [
 	'pypsexec == 0.3.0',
 	'pydantic == 2.8.2',
 	'openai == 1.28.0',
-	'spacy',
-	'BeautifulSoup4'
+	'BeautifulSoup4',
+	'nltk'
 ]
 
 [project.urls]
diff --git a/src/hackingBuddyGPT/capabilities/submit_http_method.py b/src/hackingBuddyGPT/capabilities/submit_http_method.py
@@ -16,6 +16,7 @@ class SubmitHTTPMethod(Capability):
     _client = requests.Session()
     host: str
     follow_redirects: bool = False
+    success_function: Callable[[], None] = None
 
 
     submitted_valid_http_methods: Set[str] = field(default_factory=set, init=False)
@@ -67,7 +68,11 @@ def __call__(self, method: Literal["GET", "HEAD", "POST", "PUT", "DELETE", "OPTI
             return str(e)
 
         headers = "\r\n".join(f"{k}: {v}" for k, v in resp.headers.items())
-
+        if len(self.submitted_valid_http_methods) == len(self.valid_http_methods):
+            if self.success_function is not None:
+                self.success_function()
+            else:
+                return "All methods submitted, congratulations"
         # turn the response into "plain text format" for responding to the prompt
         return f"HTTP/1.1 {resp.status_code} {resp.reason}\r\n{headers}\r\n\r\n{resp.text}"""
 
diff --git a/src/hackingBuddyGPT/cli/wintermute.py b/src/hackingBuddyGPT/cli/wintermute.py
@@ -8,12 +8,13 @@ def main():
     parser = argparse.ArgumentParser()
     subparser = parser.add_subparsers(required=True)
     for name, use_case in use_cases.items():
-        use_case.build_parser(subparser.add_parser(
+        subb = subparser.add_parser(
             name=use_case.name,
             help=use_case.description
-        ))
-
-    parsed = parser.parse_args(sys.argv[1:])
+        )
+        use_case.build_parser(subb)
+    x= sys.argv[1:]
+    parsed = parser.parse_args(x)
     instance = parsed.use_case(parsed)
     instance.init()
     instance.run()
diff --git a/src/hackingBuddyGPT/usecases/web_api_testing/prompt_engineer.py b/src/hackingBuddyGPT/usecases/web_api_testing/prompt_engineer.py
@@ -1,4 +1,6 @@
-import spacy
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
 from instructor.retry import InstructorRetryException
 
 
@@ -32,19 +34,14 @@ def __init__(self, strategy, llm_handler, history, schemas, response_handler):
         self.response_handler = response_handler
         self.llm_handler = llm_handler
         self.round = 0
-        self.found_endpoints = []
+        self.found_endpoints = ["/"]
+        self.endpoint_methods = {}
+        self.endpoint_found_methods = {}
         model_name = "en_core_web_sm"
 
-        # Check if the model is already installed
-        from spacy.util import is_package
-        if not is_package(model_name):
-            print(f"Model '{model_name}' is not installed. Installing now...")
-            spacy.cli.download(model_name)
-
-        # Load the model
-        self.nlp = spacy.load(model_name)
-
-        self.nlp = spacy.load("en_core_web_sm")
+        # Check if the models are already installed
+        nltk.download('punkt')
+        nltk.download('stopwords')
         self._prompt_history = history
         self.prompt = self._prompt_history
         self.previous_prompt = self._prompt_history[self.round]["content"]
@@ -105,7 +102,7 @@ def get_http_action_template(self, method):
 
         else:
             return (
-                f"Create HTTPRequests of type {method} considering the found schemas: {self.schemas} and understand the responses. Ensure that they are correct requests.")
+                f"Create HTTPRequests of type {method} considering only the object with id=1 for the endpoint and understand the responses. Ensure that they are correct requests.")
 
 
     def chain_of_thought(self, doc=False, hint=""):
@@ -129,34 +126,56 @@ def chain_of_thought(self, doc=False, hint=""):
             "Make the OpenAPI specification available to developers by incorporating it into your API documentation site and keep the documentation up to date with API changes."
         ]
 
-        http_methods = [ "POST", "DELETE", "PUT"]
+        http_methods = [  "PUT", "DELETE"]
         http_phase = {
-            6: http_methods[0],
-            16: http_methods[1], # Delete one of instance of this:{self.llm_handler.get_created_objects()}",
-            20: http_methods[2]
+            5: http_methods[0],
+            10: http_methods[1]
         }
 
         if doc:
-            if self.round <= 5:
+            if self.round < 5:
 
                 chain_of_thought_steps = [
                                              f"Identify all available endpoints via GET Requests. Exclude those in this list: {self.found_endpoints}", f"Note down the response structures, status codes, and headers for each endpoint.",
                                              f"For each endpoint, document the following details: URL, HTTP method, "
                                              f"query parameters and path variables, expected request body structure for  requests, response structure for successful and error responses."
                                          ] + common_steps
             else:
-                phase = http_phase.get(min(filter(lambda x: self.round <= x, http_phase.keys())))
-                print(f'phase:{phase}')
-                if phase != "DELETE":
-                    chain_of_thought_steps = [
-                                             f"Identify all valid calls for HTTP method {phase}.",
+                if self.round <= 10:
+                    phase = http_phase.get(min(filter(lambda x: self.round <= x, http_phase.keys())))
+                    print(f'phase:{phase}')
+                    if phase != "DELETE":
+                        chain_of_thought_steps = [
+                                             f"Identify for all endpoints {self.found_endpoints} excluding {self.endpoint_found_methods[phase]} a valid HTTP method {phase} call.",
                                              self.get_http_action_template(phase)
                                          ] + common_steps
-                else:
-                    chain_of_thought_steps = [
+                    else:
+                        chain_of_thought_steps = [
                                                  f"Check for all endpoints the DELETE method. Delete the first instance for all endpoints. ",
                                                  self.get_http_action_template(phase)
                                              ] + common_steps
+                else:
+                    endpoints_needing_help = []
+                    endpoints_and_needed_methods = {}
+
+                    # Standard HTTP methods
+                    http_methods = {"GET", "POST", "PUT", "DELETE"}
+
+                    for endpoint in self.endpoint_methods:
+                        # Calculate the missing methods for the current endpoint
+                        missing_methods = http_methods - set(self.endpoint_methods[endpoint])
+
+                        if len(self.endpoint_methods[endpoint]) < 4:
+                            endpoints_needing_help.append(endpoint)
+                            # Add the missing methods to the dictionary
+                            endpoints_and_needed_methods[endpoint] = list(missing_methods)
+
+                    print(f'endpoints_and_needed_methods: {endpoints_and_needed_methods}')
+                    print(f'first endpoint in list: {endpoints_needing_help[0]}')
+                    print(f'methods needed for first endpoint: {endpoints_and_needed_methods[endpoints_needing_help[0]][0]}')
+
+                    chain_of_thought_steps = [f"For enpoint {endpoints_needing_help[0]} find this missing method :{endpoints_and_needed_methods[endpoints_needing_help[0]][0]} "
+                                              f"If all the HTTP methods have already been found for an endpoint, then do not include this endpoint in your search. ",]
 
         else:
             if self.round == 0:
@@ -175,19 +194,19 @@ def chain_of_thought(self, doc=False, hint=""):
 
     def token_count(self, text):
         """
-        Counts the number of word tokens in the provided text using spaCy's tokenizer.
-
-        Args:
-            text (str): The input text to tokenize and count.
-
-        Returns:
-            int: The number of tokens in the input text.
-        """
-        # Process the text through spaCy's pipeline
-        doc = self.nlp(text)
-        # Count tokens that aren't punctuation marks
-        tokens = [token for token in doc if not token.is_punct]
-        return len(tokens)
+        Counts the number of word tokens in the provided text using NLTK's tokenizer.
+
+        Args:
+            text (str): The input text to tokenize and count.
+
+        Returns:
+            int: The number of tokens that contain at least one letter or digit.
+        """
+        # Tokenize the text using NLTK
+        tokens = word_tokenize(text)
+        # Drop pure punctuation only; str.isalnum() alone also rejected "n't", "3.14" and "e-mail"
+        words = [token for token in tokens if any(ch.isalnum() for ch in token)]
+        return len(words)
 
 
     def check_prompt(self, previous_prompt, chain_of_thought_steps, max_tokens=900):
diff --git a/src/hackingBuddyGPT/usecases/web_api_testing/simple_openapi_documentation.py b/src/hackingBuddyGPT/usecases/web_api_testing/simple_openapi_documentation.py
@@ -29,19 +29,19 @@ class SimpleWebAPIDocumentation(Agent):
     _all_http_methods_found: bool = False
 
     # Description for expected HTTP methods
-    http_method_description: str = parameter(
+    _http_method_description: str = parameter(
         desc="Pattern description for expected HTTP methods in the API response",
         default="A string that represents an HTTP method (e.g., 'GET', 'POST', etc.)."
     )
 
     # Template for HTTP methods in API requests
-    http_method_template: str = parameter(
+    _http_method_template: str = parameter(
         desc="Template to format HTTP methods in API requests, with {method} replaced by actual HTTP method names.",
         default="{method}"
     )
 
     # List of expected HTTP methods
-    http_methods: str = parameter(
+    _http_methods: str = parameter(
         desc="Expected HTTP methods in the API, as a comma-separated list.",
         default="GET,POST,PUT,PATCH,DELETE"
     )
@@ -75,13 +75,23 @@ def _setup_initial_prompt(self):
 
 
     def all_http_methods_found(self):
-        self._log.console.print(Panel("All HTTP methods found! Congratulations!", title="system"))
-        self._all_http_methods_found = True
+        """Return True when every known endpoint has four documented methods (one gap tolerated)."""
+        endpoint_methods = self.documentation_handler.endpoint_methods
+        print(f'found endpoints:{endpoint_methods.items()}')
+        print(f'found endpoints values:{endpoint_methods.values()}')
+        if not endpoint_methods:  # nothing discovered yet; 0 == 0 below must not signal completion
+            return False
+        found_endpoints = sum(len(value_list) for value_list in endpoint_methods.values())
+        expected_endpoints = len(endpoint_methods) * 4  # four methods are expected per endpoint
+        print(f'found endpoints:{found_endpoints}')
+        print(f'expected endpoints:{expected_endpoints}')
+        print(f'correct? {found_endpoints == expected_endpoints}')
+        return found_endpoints in (expected_endpoints, expected_endpoints - 1)
 
     def perform_round(self, turn: int):
         prompt = self.prompt_engineer.generate_prompt(doc=True)
         response, completion = self.llm_handler.call_llm(prompt)
-        self._handle_response(completion, response)
+        return self._handle_response(completion, response)
 
     def _handle_response(self, completion, response):
         message = completion.choices[0].message
@@ -101,8 +111,17 @@ def _handle_response(self, completion, response):
                 self.prompt_engineer.found_endpoints = self.documentation_handler.update_openapi_spec(response, result)
                 self.documentation_handler.write_openapi_to_yaml()
                 self.prompt_engineer.schemas = self.documentation_handler.schemas
+                from collections import defaultdict
+                http_methods_dict = defaultdict(list)
+
+                # Iterate through the original dictionary
+                for endpoint, methods in self.documentation_handler.endpoint_methods.items():
+                    for method in methods:
+                        http_methods_dict[method].append(endpoint)
+                self.prompt_engineer.endpoint_found_methods =  http_methods_dict
+                self.prompt_engineer.endpoint_methods = self.documentation_handler.endpoint_methods
                 print(f'SCHEMAS:{self.prompt_engineer.schemas}')
-        return self._all_http_methods_found
+        return self.all_http_methods_found()
 
 
 
diff --git a/src/hackingBuddyGPT/usecases/web_api_testing/simple_web_api_testing.py b/src/hackingBuddyGPT/usecases/web_api_testing/simple_web_api_testing.py
@@ -95,7 +95,7 @@ def _setup_capabilities(self):
         methods_set = {self.http_method_template.format(method=method) for method in self.http_methods.split(",")}
         notes = self._context["notes"]
         self._capabilities = {
-            "submit_http_method": SubmitHTTPMethod(self.http_method_description, methods_set, self.host),
+            "submit_http_method": HTTPRequest(self.host),
             "http_request": HTTPRequest(self.host),
             "record_note": RecordNote(notes)
         }
@@ -134,8 +134,7 @@ def _handle_response(self, completion, response):
             result_str = self.response_handler.parse_http_status_line(result)
             self._prompt_history.append(tool_message(result_str, tool_call_id))
 
-        return self._all_http_methods_found
-
+        return self.all_http_methods_found()
 @use_case("Minimal implementation of a web API testing use case")
 class SimpleWebAPITestingUseCase(AutonomousAgentUseCase[SimpleWebAPITesting]):
     pass
diff --git a/src/hackingBuddyGPT/usecases/web_api_testing/utils/documentation_handler.py b/src/hackingBuddyGPT/usecases/web_api_testing/utils/documentation_handler.py
@@ -29,6 +29,7 @@ def __init__(self, llm_handler, response_handler):
         """
         self.response_handler = response_handler
         self.schemas = {}
+        self.endpoint_methods ={}
         self.filename = f"openapi_spec_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.yaml"
         self.openapi_spec = {
             "openapi": "3.0.0",
@@ -42,14 +43,17 @@ def __init__(self, llm_handler, response_handler):
             "components": {"schemas": {}}
         }
         self.llm_handler = llm_handler
-        self.api_key = llm_handler.llm.api_key
+        #self.api_key = llm_handler.llm.api_key
         current_path = os.path.dirname(os.path.abspath(__file__))
         self.file_path = os.path.join(current_path, "openapi_spec")
         self.file = os.path.join(self.file_path, self.filename)
         self._capabilities = {
             "yaml": YAMLFile()
         }
 
+    def partial_match(self, element, string_list):
+        return any(element in string or string in element for string in string_list)
+
     def update_openapi_spec(self, resp, result):
         """
         Updates the OpenAPI specification based on the API response provided.
@@ -67,31 +71,51 @@ def update_openapi_spec(self, resp, result):
             method = request.method
             print(f'method: {method}')
             # Ensure that path and method are not None and method has no numeric characters
+            # Ensure path and method are valid and method has no numeric characters
             if path and method:
+                endpoint_methods = self.endpoint_methods
+                endpoints = self.openapi_spec['endpoints']
+                x = path.split('/')[1]
+
                 # Initialize the path if not already present
-                if path not in self.openapi_spec['endpoints']:
-                    self.openapi_spec['endpoints'][path] = {}
+                if path not in endpoints and x != "":
+                    endpoints[path] = {}
+                    if '1' not in path:
+                        endpoint_methods[path] = []
+
                 # Update the method description within the path
-                example, reference, self.openapi_spec = self.response_handler.parse_http_response_to_openapi_example(self.openapi_spec, result, path, method)
+                example, reference, self.openapi_spec = self.response_handler.parse_http_response_to_openapi_example(
+                    self.openapi_spec, result, path, method
+                )
                 self.schemas = self.openapi_spec["components"]["schemas"]
-                if example is not None or reference is not None:
-                    self.openapi_spec['endpoints'][path][method.lower()] = {
+
+                if example or reference:
+                    endpoints[path][method.lower()] = {
                         "summary": f"{method} operation on {path}",
                         "responses": {
                             "200": {
                                 "description": "Successful response",
                                 "content": {
                                     "application/json": {
-                                        "schema": {
-                                            "$ref": reference
-                                        },
+                                        "schema": {"$ref": reference},
                                         "examples": example
                                     }
                                 }
                             }
                         }
                     }
-            return  list(self.openapi_spec['endpoints'].keys())
+
+                    if '1' not in path and x != "":
+                        endpoint_methods[path].append(method)
+                    elif self.partial_match(x, endpoints.keys()):
+                        path = f"/{x}"
+                        print(f'endpoint methods = {endpoint_methods}')
+                        print(f'new path:{path}')
+                        endpoint_methods[path].append(method)
+
+                    endpoint_methods[path] = list(set(endpoint_methods[path]))
+
+            return list(endpoints.keys())
 
     def write_openapi_to_yaml(self):
         """
diff --git a/tests/test_web_api_documentation.py b/tests/test_web_api_documentation.py
diff --git a/tests/test_web_api_testing.py b/tests/test_web_api_testing.py