MAINT: Improve docstrings in pyrit/memory (#1176)

balancehat · balancehat · commit f5e7a323bf90 · 2025-12-05T10:17:15.000-05:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -251,7 +251,7 @@ extend-select = [
 # Temporary ignores for pyrit/ subdirectories until issue #1176
 # https://github.com/Azure/PyRIT/issues/1176 is fully resolved
 # TODO: Remove these ignores once the issues are fixed
-"pyrit/{auxiliary_attacks,exceptions,executor,memory,models,prompt_converter,prompt_normalizer,prompt_target,score,ui}/**/*.py" = ["D101", "D102", "D103", "D104", "D105", "D106", "D107", "D401", "D404", "D417", "D418", "DOC102", "DOC201", "DOC202", "DOC402", "DOC501"]
+"pyrit/{auxiliary_attacks,exceptions,executor,models,prompt_converter,prompt_normalizer,prompt_target,score,ui}/**/*.py" = ["D101", "D102", "D103", "D104", "D105", "D106", "D107", "D401", "D404", "D417", "D418", "DOC102", "DOC201", "DOC202", "DOC402", "DOC501"]
 "pyrit/__init__.py" = ["D104"]
 
 [tool.ruff.lint.pydocstyle]
diff --git a/pyrit/memory/__init__.py b/pyrit/memory/__init__.py
@@ -1,6 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+"""
+Provide functionality for storing and retrieving conversation history and embeddings.
+
+This package defines the core `MemoryInterface` and concrete implementations for different storage backends.
+"""
+
 from pyrit.memory.memory_models import EmbeddingDataEntry, PromptMemoryEntry, SeedEntry, AttackResultEntry
 from pyrit.memory.memory_interface import MemoryInterface
 
diff --git a/pyrit/memory/azure_sql_memory.py b/pyrit/memory/azure_sql_memory.py
@@ -60,6 +60,19 @@ def __init__(
         results_sas_token: Optional[str] = None,
         verbose: bool = False,
     ):
+        """
+        Initialize an Azure SQL Memory backend.
+
+        Args:
+            connection_string (Optional[str]): The connection string for the Azure SQL Database. If not provided,
+                it falls back to the 'AZURE_SQL_DB_CONNECTION_STRING' environment variable.
+            results_container_url (Optional[str]): The URL to an Azure Storage Container. If not provided,
+                it falls back to the 'AZURE_STORAGE_ACCOUNT_DB_DATA_CONTAINER_URL' environment variable.
+            results_sas_token (Optional[str]): The Shared Access Signature (SAS) token for the storage container.
+                If not provided, it falls back to the 'AZURE_STORAGE_ACCOUNT_DB_DATA_SAS_TOKEN' environment variable.
+            verbose (bool): Whether to enable verbose logging for the database engine. Defaults to False.
+        """
+        self._init_storage_io()
         self._connection_string = default_values.get_required_value(
             env_var_name=self.AZURE_SQL_DB_CONNECTION_STRING, passed_value=connection_string
         )
@@ -114,7 +127,7 @@ def _init_storage_io(self):
 
     def _create_auth_token(self) -> None:
         """
-        Creates an Azure Entra ID access token.
+        Create an Azure Entra ID access token.
         Stores the token and its expiry time.
         """
         azure_auth = AzureAuth(token_scope=self.TOKEN_URL)
@@ -133,13 +146,19 @@ def _refresh_token_if_needed(self) -> None:
 
     def _create_engine(self, *, has_echo: bool) -> Engine:
         """
-        Creates the SQLAlchemy engine for Azure SQL Server.
+        Create the SQLAlchemy engine for Azure SQL Server.
 
         Creates an engine bound to the specified server and database. The `has_echo` parameter
         controls the verbosity of SQL execution logging.
 
         Args:
             has_echo (bool): Flag to enable detailed SQL execution logging.
+
+        Returns:
+            Engine: SQLAlchemy engine bound to the Azure SQL Database.
+
+        Raises:
+            SQLAlchemyError: If the engine creation fails.
         """
         try:
             # Create the SQLAlchemy engine.
@@ -156,6 +175,8 @@ def _create_engine(self, *, has_echo: bool) -> Engine:
 
     def _enable_azure_authorization(self) -> None:
         """
+        Enable Azure token-based authorization for SQL connections.
+
         The following is necessary because of how SQLAlchemy and PyODBC handle connection creation. In PyODBC, the
         token is passed outside the connection string in the `connect()` method. Since SQLAlchemy lazy-loads
         its connections, we need to set this as a separate argument to the `connect()` method. In SQLALchemy
@@ -184,7 +205,7 @@ def provide_token(_dialect, _conn_rec, cargs, cparams):
 
     def _create_tables_if_not_exist(self):
         """
-        Creates all tables defined in the Base metadata, if they don't already exist in the database.
+        Create all tables defined in the Base metadata, if they don't already exist in the database.
 
         Raises:
             Exception: If there's an issue creating the tables in the database.
@@ -198,7 +219,7 @@ def _create_tables_if_not_exist(self):
 
     def _add_embeddings_to_memory(self, *, embedding_data: Sequence[EmbeddingDataEntry]) -> None:
         """
-        Inserts embedding data into memory storage.
+        Insert embedding data into memory storage.
         """
         self._insert_entries(entries=embedding_data)
 
@@ -295,7 +316,7 @@ def _get_seed_metadata_conditions(self, *, metadata: dict[str, Union[str, int]])
 
     def _get_attack_result_harm_category_condition(self, *, targeted_harm_categories: Sequence[str]) -> Any:
         """
-        SQL Azure implementation for filtering AttackResults by targeted harm categories.
+        Get the SQL Azure condition for filtering AttackResults by targeted harm categories.
 
         Uses JSON_QUERY() function specific to SQL Azure to check if categories exist in the JSON array.
 
@@ -333,7 +354,7 @@ def _get_attack_result_harm_category_condition(self, *, targeted_harm_categories
 
     def _get_attack_result_label_condition(self, *, labels: dict[str, str]) -> Any:
         """
-        SQL Azure implementation for filtering AttackResults by labels.
+        Get the SQL Azure condition for filtering AttackResults by labels.
 
         Uses JSON_VALUE() function specific to SQL Azure with parameterized queries.
 
@@ -364,7 +385,7 @@ def _get_attack_result_label_condition(self, *, labels: dict[str, str]) -> Any:
 
     def _get_scenario_result_label_condition(self, *, labels: dict[str, str]) -> Any:
         """
-        SQL Azure implementation for filtering ScenarioResults by labels.
+        Get the SQL Azure condition for filtering ScenarioResults by labels.
 
         Uses JSON_VALUE() function specific to SQL Azure.
 
@@ -385,7 +406,7 @@ def _get_scenario_result_label_condition(self, *, labels: dict[str, str]) -> Any
 
     def _get_scenario_result_target_endpoint_condition(self, *, endpoint: str) -> Any:
         """
-        SQL Azure implementation for filtering ScenarioResults by target endpoint.
+        Get the SQL Azure condition for filtering ScenarioResults by target endpoint.
 
         Uses JSON_VALUE() function specific to SQL Azure.
 
@@ -402,7 +423,7 @@ def _get_scenario_result_target_endpoint_condition(self, *, endpoint: str) -> An
 
     def _get_scenario_result_target_model_condition(self, *, model_name: str) -> Any:
         """
-        SQL Azure implementation for filtering ScenarioResults by target model name.
+        Get the SQL Azure condition for filtering ScenarioResults by target model name.
 
         Uses JSON_VALUE() function specific to SQL Azure.
 
@@ -419,7 +440,7 @@ def _get_scenario_result_target_model_condition(self, *, model_name: str) -> Any
 
     def add_message_pieces_to_memory(self, *, message_pieces: Sequence[MessagePiece]) -> None:
         """
-        Inserts a list of message pieces into the memory storage.
+        Insert a list of message pieces into the memory storage.
 
         """
         self._insert_entries(entries=[PromptMemoryEntry(entry=piece) for piece in message_pieces])
@@ -434,17 +455,23 @@ def dispose_engine(self):
 
     def get_all_embeddings(self) -> Sequence[EmbeddingDataEntry]:
         """
-        Fetches all entries from the specified table and returns them as model instances.
+        Fetch all embedding entries and return them as model instances.
+
+        Returns:
+            Sequence[EmbeddingDataEntry]: A sequence of EmbeddingDataEntry instances representing all stored embeddings.
         """
         result: Sequence[EmbeddingDataEntry] = self._query_entries(EmbeddingDataEntry)
         return result
 
     def _insert_entry(self, entry: Base) -> None:  # type: ignore
         """
-        Inserts an entry into the Table.
+        Insert an entry into the Table.
 
         Args:
             entry: An instance of a SQLAlchemy model to be added to the Table.
+
+        Raises:
+            SQLAlchemyError: If the insertion fails.
         """
         with closing(self.get_session()) as session:
             try:
@@ -459,7 +486,15 @@ def _insert_entry(self, entry: Base) -> None:  # type: ignore
     # common between SQLAlchemy-based implementations, regardless of engine.
     # Perhaps we should find a way to refactor
     def _insert_entries(self, *, entries: Sequence[Base]) -> None:  # type: ignore
-        """Inserts multiple entries into the database."""
+        """
+        Insert multiple entries into the database.
+
+        Args:
+            entries (Sequence[Base]): A sequence of SQLAlchemy model instances to insert.
+
+        Raises:
+            SQLAlchemyError: If the insertion fails.
+        """
         with closing(self.get_session()) as session:
             try:
                 session.add_all(entries)
@@ -471,7 +506,10 @@ def _insert_entries(self, *, entries: Sequence[Base]) -> None:  # type: ignore
 
     def get_session(self) -> Session:
         """
-        Provides a session for database operations.
+        Provide a session for database operations.
+
+        Returns:
+            Session: A new SQLAlchemy session bound to the configured engine.
         """
         return self.SessionFactory()
 
@@ -484,15 +522,19 @@ def _query_entries(
         join_scores: bool = False,
     ) -> MutableSequence[Model]:
         """
-        Fetches data from the specified table model with optional conditions.
+        Fetch data from the specified table model with optional conditions.
 
         Args:
+            Model: The SQLAlchemy model class to query.
             conditions: SQLAlchemy filter conditions (Optional).
             distinct: Flag to return distinct rows (defaults to False).
             join_scores: Flag to join the scores table with entries (defaults to False).
 
         Returns:
             List of model instances representing the rows fetched from the table.
+
+        Raises:
+            SQLAlchemyError: If the query fails.
         """
         with closing(self.get_session()) as session:
             try:
@@ -515,14 +557,18 @@ def _query_entries(
 
     def _update_entries(self, *, entries: MutableSequence[Base], update_fields: dict) -> bool:  # type: ignore
         """
-        Updates the given entries with the specified field values.
+        Update the given entries with the specified field values.
 
         Args:
             entries (Sequence[Base]): A list of SQLAlchemy model instances to be updated.
             update_fields (dict): A dictionary of field names and their new values.
 
         Returns:
             bool: True if the update was successful, False otherwise.
+
+        Raises:
+            ValueError: If 'update_fields' is empty.
+            SQLAlchemyError: If the update fails.
         """
         if not update_fields:
             raise ValueError("update_fields must be provided to update prompt entries.")
diff --git a/pyrit/memory/central_memory.py b/pyrit/memory/central_memory.py
@@ -10,8 +10,8 @@
 
 class CentralMemory:
     """
-    Provides a centralized memory instance across the framework. The provided memory
-    instance will be reused for future calls.
+    Provide a centralized memory instance across the framework.
+    The provided memory instance will be reused for future calls.
     """
 
     _memory_instance: MemoryInterface = None
@@ -30,7 +30,13 @@ def set_memory_instance(cls, passed_memory: MemoryInterface) -> None:
     @classmethod
     def get_memory_instance(cls) -> MemoryInterface:
         """
-        Returns a centralized memory instance.
+        Return a centralized memory instance.
+
+        Returns:
+            MemoryInterface: The singleton memory instance.
+
+        Raises:
+            ValueError: If the central memory instance has not been set.
         """
         if cls._memory_instance:
             logger.info(f"Using existing memory instance: {type(cls._memory_instance).__name__}")
diff --git a/pyrit/memory/memory_embedding.py b/pyrit/memory/memory_embedding.py
@@ -18,19 +18,32 @@ class MemoryEmbedding:
     """
 
     def __init__(self, *, embedding_model: Optional[EmbeddingSupport] = None):
+        """
+        Initialize the memory embedding helper with a backing embedding model.
+
+        Args:
+            embedding_model (Optional[EmbeddingSupport]): The embedding model used to
+                generate text embeddings. If not provided, a ValueError is raised.
+
+        Raises:
+            ValueError: If `embedding_model` is not provided.
+        """
         if embedding_model is None:
             raise ValueError("embedding_model must be set.")
         self.embedding_model = embedding_model
 
     def generate_embedding_memory_data(self, *, message_piece: MessagePiece) -> EmbeddingDataEntry:
         """
-        Generates metadata for a message piece.
+        Generate embedding data for a message piece.
 
         Args:
             message_piece (MessagePiece): the message piece for which to generate a text embedding
 
         Returns:
             EmbeddingDataEntry: The generated metadata.
+
+        Raises:
+            ValueError: If the message piece is not of type text.
         """
         if message_piece.converted_value_data_type == "text":
             embedding_data = EmbeddingDataEntry(
@@ -46,6 +59,24 @@ def generate_embedding_memory_data(self, *, message_piece: MessagePiece) -> Embe
 
 
 def default_memory_embedding_factory(embedding_model: Optional[EmbeddingSupport] = None) -> MemoryEmbedding | None:
+    """
+    Create a MemoryEmbedding instance with default or provided embedding model.
+
+    Factory function that creates a MemoryEmbedding instance. If an embedding_model
+    is provided, it uses that model. Otherwise, it attempts to create an Azure
+    OpenAI embedding model from environment variables.
+
+    Args:
+        embedding_model: Optional embedding model to use. If not provided,
+            attempts to create AzureTextEmbedding from environment variables.
+
+    Returns:
+        MemoryEmbedding: Configured memory embedding instance.
+
+    Raises:
+        ValueError: If no embedding model is provided and required Azure
+            OpenAI environment variables are not set.
+    """
     if embedding_model:
         return MemoryEmbedding(embedding_model=embedding_model)
 
diff --git a/pyrit/memory/memory_exporter.py b/pyrit/memory/memory_exporter.py
@@ -16,6 +16,11 @@ class MemoryExporter:
     """
 
     def __init__(self):
+        """
+        Initialize the MemoryExporter.
+
+        Sets up the available export formats using the strategy design pattern.
+        """
         # Using strategy design pattern for export functionality.
         self.export_strategies = {
             "json": self.export_to_json,
@@ -28,7 +33,7 @@ def export_data(
         self, data: list[MessagePiece], *, file_path: Optional[Path] = None, export_type: str = "json"
     ):  # type: ignore
         """
-        Exports the provided data to a file in the specified format.
+        Export the provided data to a file in the specified format.
 
         Args:
             data (list[MessagePiece]): The data to be exported, as a list of MessagePiece instances.
@@ -49,7 +54,7 @@ def export_data(
 
     def export_to_json(self, data: list[MessagePiece], file_path: Path = None) -> None:  # type: ignore
         """
-        Exports the provided data to a JSON file at the specified file path.
+        Export the provided data to a JSON file at the specified file path.
         Each item in the data list, representing a row from the table,
         is converted to a dictionary before being written to the file.
 
@@ -72,7 +77,7 @@ def export_to_json(self, data: list[MessagePiece], file_path: Path = None) -> No
 
     def export_to_csv(self, data: list[MessagePiece], file_path: Path = None) -> None:  # type: ignore
         """
-        Exports the provided data to a CSV file at the specified file path.
+        Export the provided data to a CSV file at the specified file path.
         Each item in the data list, representing a row from the table,
         is converted to a dictionary before being written to the file.
 
@@ -98,7 +103,7 @@ def export_to_csv(self, data: list[MessagePiece], file_path: Path = None) -> Non
 
     def export_to_markdown(self, data: list[MessagePiece], file_path: Path = None) -> None:  # type: ignore
         """
-        Exports the provided data to a Markdown file at the specified file path.
+        Export the provided data to a Markdown file at the specified file path.
         Each item in the data list is converted to a dictionary and formatted as a table.
 
         Args:
diff --git a/pyrit/memory/memory_interface.py b/pyrit/memory/memory_interface.py
diff --git a/pyrit/memory/memory_models.py b/pyrit/memory/memory_models.py
diff --git a/pyrit/memory/sqlite_memory.py b/pyrit/memory/sqlite_memory.py