2 changes: 1 addition & 1 deletion dojo/finding/deduplication.py
@@ -239,7 +239,7 @@ def build_dedupe_scope_queryset(test):
return (
Finding.objects.filter(scope_q)
.select_related("test", "test__engagement", "test__test_type")
.prefetch_related("endpoints")
.prefetch_related("endpoints", "vulnerability_id_set")
)


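A minimal sketch of why "vulnerability_id_set" is prefetched here (the loop below is illustrative only; the real consumer is the reimporter's reconcile logic further down, which compares vulnerability IDs per candidate finding):

candidates = build_dedupe_scope_queryset(test)
for candidate in candidates:
    # Served from the prefetch cache, so this does not issue one extra query
    # per finding (avoids N+1 behaviour during dedupe matching).
    existing_ids = {v.vulnerability_id for v in candidate.vulnerability_id_set.all()}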
9 changes: 6 additions & 3 deletions dojo/finding/helper.py
@@ -762,12 +762,15 @@ def add_endpoints(new_finding, form):
endpoint=endpoint, defaults={"date": form.cleaned_data["date"] or timezone.now()})


def save_vulnerability_ids(finding, vulnerability_ids):
def save_vulnerability_ids(finding, vulnerability_ids, *, delete_existing: bool = True):
# Remove duplicates
vulnerability_ids = list(dict.fromkeys(vulnerability_ids))

# Remove old vulnerability ids
Vulnerability_Id.objects.filter(finding=finding).delete()
# Remove old vulnerability ids if requested
# Callers can set delete_existing=False when they know there are no existing IDs
# to avoid an unnecessary delete query (e.g., for new findings)
if delete_existing:
Vulnerability_Id.objects.filter(finding=finding).delete()

# Save new vulnerability ids
# Using bulk create throws Django 50 warnings about unsaved models...
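A short usage sketch of the updated helper (illustrative only; `finding` and `new_finding` are assumed to be already-saved Finding instances):

from dojo.finding.helper import save_vulnerability_ids

# Existing finding: default behaviour, old Vulnerability_Id rows are deleted first.
save_vulnerability_ids(finding, ["CVE-2021-1234", "CVE-2021-1234", "CVE-2021-5678"])  # duplicates dropped, order kept

# Brand-new finding: no rows can exist yet, so the extra DELETE query is skipped.
save_vulnerability_ids(new_finding, ["CVE-2021-5678"], delete_existing=False)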
21 changes: 11 additions & 10 deletions dojo/importers/base_importer.py
@@ -31,7 +31,6 @@
Test_Import,
Test_Import_Finding_Action,
Test_Type,
Vulnerability_Id,
)
from dojo.notifications.helper import create_notification
from dojo.tag_utils import bulk_add_tags_to_instances
@@ -787,21 +786,23 @@ def process_cve(

return finding

def process_vulnerability_ids(
def store_vulnerability_ids(
self,
finding: Finding,
) -> Finding:
"""
Parse the `unsaved_vulnerability_ids` field from findings after they are parsed
to create `Vulnerability_Id` objects with the finding associated correctly
"""
if finding.unsaved_vulnerability_ids:
# Remove old vulnerability ids - keeping this call only because of flake8
Vulnerability_Id.objects.filter(finding=finding).delete()
Store vulnerability IDs for a finding.
Reads from finding.unsaved_vulnerability_ids and saves them; intended for new findings, so existing rows are not deleted first.

Args:
finding: The finding to store vulnerability IDs for

# user the helper function
finding_helper.save_vulnerability_ids(finding, finding.unsaved_vulnerability_ids)
Returns:
The finding object

"""
vulnerability_ids_to_process = finding.unsaved_vulnerability_ids or []
finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=False)
return finding

def process_files(
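Illustrative call sequence for the renamed method (a sketch, assuming `self` is an importer instance and `finding` was just created during import):

# The parser leaves the IDs on the unsaved_* attribute; the finding is new,
# so the helper is called with delete_existing=False and no DELETE is issued.
finding.unsaved_vulnerability_ids = ["GHSA-v6rh-hp5x-86rv", "CVE-2021-1234"]
finding = self.store_vulnerability_ids(finding)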
2 changes: 1 addition & 1 deletion dojo/importers/default_importer.py
@@ -234,7 +234,7 @@ def process_findings(
# Process any files
self.process_files(finding)
# Process vulnerability IDs
finding = self.process_vulnerability_ids(finding)
finding = self.store_vulnerability_ids(finding)
# Categorize this finding as a new one
new_findings.append(finding)
# all data is already saved on the finding, we only need to trigger post processing in batches
68 changes: 64 additions & 4 deletions dojo/importers/default_reimporter.py
@@ -175,6 +175,9 @@ def process_findings(
else:
original_findings = self.test.finding_set.all().filter(Q(service__isnull=True) | Q(service__exact=""))

# Prefetch vulnerability_id_set for reconcile_vulnerability_ids
original_findings = original_findings.prefetch_related("vulnerability_id_set")

logger.debug(f"original_findings_qyer: {original_findings.query}")
self.original_items = list(original_findings)
logger.debug(f"original_items: {[(item.id, item.hash_code) for item in self.original_items]}")
@@ -247,14 +250,19 @@ def process_findings(
unsaved_finding,
self.user,
)
# Existing finding - use reconcile_vulnerability_ids
is_new_finding = False
else:
finding = self.process_finding_that_was_not_matched(unsaved_finding)
# New finding - use store_vulnerability_ids
is_new_finding = True
# This condition __appears__ to always be true, but I am afraid to remove it
if finding:
# Process the rest of the items on the finding
finding = self.finding_post_processing(
finding,
unsaved_finding,
is_new_finding=is_new_finding,
)
# all data is already saved on the finding, we only need to trigger post processing in batches
push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by)
@@ -691,14 +699,57 @@ def process_finding_that_was_not_matched(
self.process_request_response_pairs(unsaved_finding)
return unsaved_finding

def reconcile_vulnerability_ids(
self,
finding: Finding,
) -> Finding:
"""
Reconcile vulnerability IDs for an existing finding.
Checks if IDs have changed before updating to avoid unnecessary database operations.
Uses prefetched data if available, otherwise fetches efficiently.

Args:
finding: The existing finding to reconcile vulnerability IDs for.
Must have unsaved_vulnerability_ids set.

Returns:
The finding object

"""
vulnerability_ids_to_process = finding.unsaved_vulnerability_ids or []

# Use prefetched data directly without triggering queries
existing_vuln_ids = {v.vulnerability_id for v in finding.vulnerability_id_set.all()}
new_vuln_ids = set(vulnerability_ids_to_process)

# Early exit if unchanged
if existing_vuln_ids == new_vuln_ids:
logger.debug(
f"Skipping vulnerability_ids update for finding {finding.id} - "
f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}",
)
return finding

# Update if changed
finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=True)
return finding

def finding_post_processing(
self,
finding: Finding,
finding_from_report: Finding,
) -> None:
*,
is_new_finding: bool = False,
) -> Finding:
"""
Save all associated objects to the finding after it has been saved
for the purpose of foreign key restrictions

Args:
finding: The finding to process (can be new or existing)
finding_from_report: The finding parsed from the report
is_new_finding: True if this is a newly created finding, False if it's an existing finding

"""
self.endpoint_manager.chunk_endpoints_and_disperse(finding, finding_from_report.unsaved_endpoints)
if len(self.endpoints_to_add) > 0:
@@ -715,10 +766,19 @@ def finding_post_processing(
finding.unsaved_files = finding_from_report.unsaved_files
self.process_files(finding)
# Process vulnerability IDs
if finding_from_report.unsaved_vulnerability_ids:
finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids
# Copy unsaved_vulnerability_ids from the report finding to the finding
# Always set it (even if empty list) so we can clear existing IDs when report has none
finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids or []
# Store the current cve value to check if it changes
old_cve = finding.cve
# legacy cve field has already been processed/set earlier
return self.process_vulnerability_ids(finding)
# Use store_vulnerability_ids for new findings, reconcile_vulnerability_ids for existing findings
finding = self.store_vulnerability_ids(finding) if is_new_finding else self.reconcile_vulnerability_ids(finding)
# Save the finding only if the cve field was changed by save_vulnerability_ids
# This is temporary as the cve field will be phased out
if finding.cve != old_cve:
finding.save()
return finding

def process_groups_for_all_findings(
self,
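A condensed sketch of the reconcile path for an existing finding, using the IDs from the test fixture below (assumes the finding was previously imported with only the GHSA id and that vulnerability_id_set was prefetched as above):

finding.unsaved_vulnerability_ids = ["GHSA-v6rh-hp5x-86rv", "CVE-2021-1234", "CVE-2021-5678"]
existing = {v.vulnerability_id for v in finding.vulnerability_id_set.all()}  # {"GHSA-v6rh-hp5x-86rv"}
if existing != set(finding.unsaved_vulnerability_ids):
    # IDs changed on reimport: delete the old rows and write the new set.
    finding_helper.save_vulnerability_ids(finding, finding.unsaved_vulnerability_ids, delete_existing=True)
# If the legacy cve field was changed by save_vulnerability_ids, finding_post_processing
# saves the finding once more so the change is persisted.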
@@ -0,0 +1,167 @@
{
"matches": [
{
"vulnerability": {
"id": "GHSA-v6rh-hp5x-86rv",
"dataSource": "https://github.com/advisories/GHSA-v6rh-hp5x-86rv",
"namespace": "github:python",
"severity": "High",
"urls": [
"https://github.com/advisories/GHSA-v6rh-hp5x-86rv"
],
"description": "Potential bypass of an upstream access control based on URL paths in Django",
"cvss": [],
"fix": {
"versions": [
"3.2.10"
],
"state": "fixed"
},
"advisories": []
},
"relatedVulnerabilities": [
{
"id": "CVE-2021-1234",
"dataSource": "https://nvd.nist.gov/vuln/detail/CVE-2021-1234",
"namespace": "nvd",
"severity": "High",
"urls": [
"https://example.com/cve-2021-1234"
],
"description": "A different CVE for testing vulnerability ID changes",
"cvss": [
{
"version": "3.1",
"vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H",
"metrics": {
"baseScore": 9.8,
"exploitabilityScore": 3.9,
"impactScore": 5.9
},
"vendorMetadata": {}
}
]
},
{
"id": "CVE-2021-5678",
"dataSource": "https://nvd.nist.gov/vuln/detail/CVE-2021-5678",
"namespace": "nvd",
"severity": "Medium",
"urls": [
"https://example.com/cve-2021-5678"
],
"description": "Another different CVE for testing vulnerability ID changes",
"cvss": [
{
"version": "3.1",
"vector": "CVSS:3.1/AV:N/AC:L/PR:L/UI:N/S:U/C:L/I:L/A:L",
"metrics": {
"baseScore": 6.3,
"exploitabilityScore": 3.9,
"impactScore": 2.4
},
"vendorMetadata": {}
}
]
}
],
"matchDetails": [
{
"matcher": "python-matcher",
"searchedBy": {
"language": "python",
"namespace": "github:python"
},
"found": {
"versionConstraint": ">=3.2,<3.2.10 (python)"
}
}
],
"artifact": {
"name": "Django",
"version": "3.2.9",
"type": "python",
"locations": [
{
"path": "/usr/local/lib/python3.8/site-packages/Django-3.2.9.dist-info/METADATA",
"layerID": "sha256:b1d4455cf82b15a50b006fe87bd29f694c8f9155456253eb67fdd155b5edcf4a"
}
],
"language": "python",
"licenses": [
"BSD-3-Clause"
],
"cpes": [
"cpe:2.3:a:django_software_foundation:Django:3.2.9:*:*:*:*:*:*:*"
],
"purl": "pkg:pypi/Django@3.2.9",
"metadata": null
}
}
],
"source": {
"type": "image",
"target": {
"userInput": "vulnerable-image:latest",
"imageID": "sha256:ce9898fd214aef9c994a42624b09056bdce3ff4a8e3f68dc242d967b80fcbeee",
"manifestDigest": "sha256:9d8825ab20ac86b40eb71495bece1608a302fb180384740697a28c2b0a5a0fc6",
"mediaType": "application/vnd.docker.distribution.manifest.v2+json",
"tags": [
"vulnerable-image:latest"
],
"imageSize": 707381791,
"layers": []
}
},
"distro": {
"name": "debian",
"version": "10",
"idLike": ""
},
"descriptor": {
"name": "grype",
"version": "0.28.0",
"configuration": {
"configPath": "",
"output": "json",
"file": "",
"output-template-file": "",
"quiet": false,
"check-for-app-update": true,
"only-fixed": false,
"scope": "Squashed",
"log": {
"structured": false,
"level": "",
"file": ""
},
"db": {
"cache-dir": "/home/user/.cache/grype/db",
"update-url": "https://toolbox-data.anchore.io/grype/databases/listing.json",
"ca-cert": "",
"auto-update": true,
"validate-by-hash-on-start": false
},
"dev": {
"profile-cpu": false,
"profile-mem": false
},
"fail-on-severity": "",
"registry": {
"insecure-skip-tls-verify": false,
"insecure-use-http": false,
"auth": []
},
"ignore": null,
"exclude": []
},
"db": {
"built": "2021-12-24T08:14:02Z",
"schemaVersion": 3,
"location": "/home/user/.cache/grype/db/3",
"checksum": "sha256:6c4777e1acea787e5335ccee6b5e4562cd1767b9cca138c07e0802efb2a74162",
"error": null
}
}
}
