From d32806b54082b0c5f6c222e9aae360a1a37c08f3 Mon Sep 17 00:00:00 2001 From: Martin Torp Date: Wed, 10 Jun 2026 11:14:57 +0200 Subject: [PATCH 1/4] Retry transient full-scan upload failures (502/503/504/408, dropped connections) A full-scan upload can fail transiently at the gateway/connection level - an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout - without the server having created the scan. The CLI previously made exactly one attempt, so an entire run (including a completed reachability analysis) died on a single transient failure even though a retried upload almost always succeeds. create_full_scan now retries the fullscans POST up to 3 total attempts with increasing waits (~10s, then ~30s, plus jitter) on transient failures only: APIBadGateway (502), APIConnectionError, APITimeout, and catch-all APIFailure whose embedded original_status_code is 408/503/504. Dedicated 4xx classes, catch-all 400s, and error payloads are never retried. In these failure modes the server never finished reading the request body, so no scan was created and a retry does not duplicate one; in the rare case where a gateway timeout races a request the server later completes, the extra scan is benign and superseded by the retry (as if the CLI had run twice). The retry loop lives inside the existing try/finally so the brotli-compressed .socket.facts.json.br temp files survive until every attempt has finished; fullscans.post rebuilds its lazy file loaders from the plain paths on every call, so re-invoking it per attempt is safe. Assisted-by: Claude Code:claude-opus-4-8 --- CHANGELOG.md | 16 ++ pyproject.toml | 2 +- socketsecurity/__init__.py | 2 +- socketsecurity/core/__init__.py | 79 ++++++++- tests/unit/test_full_scan_retry.py | 265 +++++++++++++++++++++++++++++ uv.lock | 2 +- 6 files changed, 361 insertions(+), 5 deletions(-) create mode 100644 tests/unit/test_full_scan_retry.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 060d486..1ea6b3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## 2.4.8 + +### Fixed: retry transient full-scan upload failures + +- The full-scan upload (`POST /orgs//full-scans`) now retries transient + gateway/connection failures — HTTP 502/503/504/408, dropped or reset connections, and + request timeouts — up to 3 total attempts with increasing waits (~10s, then ~30s, plus + jitter). Such failures are intermittent and a retried upload almost always succeeds. + In these failure modes the server never finished reading the request body, so no scan + was created and a retry does not duplicate one; in the rare case where a gateway + timeout races a request the server later + completes, the extra scan is benign and superseded by the retried one (as if the CLI had + run twice). + Non-transient errors (400/401/403/404/429 and error payloads) are never retried. Each + retry logs a warning explaining what failed and when the next attempt happens. + ## 2.4.7 ### Changed: pin @coana-tech/cli version; auto-update is now opt-in diff --git a/pyproject.toml b/pyproject.toml index 6bed831..18c6332 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "hatchling.build" [project] name = "socketsecurity" -version = "2.4.7" +version = "2.4.8" requires-python = ">= 3.11" license = {"file" = "LICENSE"} dependencies = [ diff --git a/socketsecurity/__init__.py b/socketsecurity/__init__.py index 91c7faf..9f1797f 100644 --- a/socketsecurity/__init__.py +++ b/socketsecurity/__init__.py @@ -1,3 +1,3 @@ __author__ = 'socket.dev' -__version__ = '2.4.7' +__version__ = '2.4.8' USER_AGENT = f'SocketPythonCLI/{__version__}' diff --git a/socketsecurity/core/__init__.py b/socketsecurity/core/__init__.py index 20ff28f..045d8d5 100644 --- a/socketsecurity/core/__init__.py +++ b/socketsecurity/core/__init__.py @@ -1,5 +1,6 @@ import logging import os +import random import re import sys import tarfile @@ -13,7 +14,12 @@ if TYPE_CHECKING: from socketsecurity.config import CliConfig from socketdev import socketdev -from socketdev.exceptions import APIFailure +from socketdev.exceptions import ( + APIBadGateway, + APIConnectionError, + APIFailure, + APITimeout, +) from socketdev.fullscans import FullScanParams, SocketArtifact from socketdev.org import Organization from socketdev.repos import RepositoryInfo @@ -76,6 +82,48 @@ TIER1_FINALIZE_MAX_ATTEMPTS = 3 TIER1_FINALIZE_BACKOFF_SECONDS = 1.0 +# Full scan upload retry policy. An upload can fail transiently at the gateway/connection +# level (an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout) +# without the server having created the scan; a retried upload almost always succeeds. +# In these failure modes no scan was created, so a retry does not duplicate one. (A +# duplicate is possible only if a gateway timeout races a request the server later +# completes; that is benign - the retried scan supersedes the orphaned one, same as +# running the CLI twice.) +FULL_SCAN_UPLOAD_MAX_ATTEMPTS = 3 +# Wait before retry attempt 2 and attempt 3 respectively (plus a little jitter so a fleet of +# CI jobs hitting the same failure doesn't retry in lock-step). +FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0) +FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS = 2.0 +# Transient gateway/timeout HTTP statuses that the SDK does NOT raise as a dedicated +# exception class (502 has APIBadGateway; 408/503/504 surface as the catch-all APIFailure +# with the status only present in the message text - see _is_transient_full_scan_upload_error). +FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES = frozenset({408, 503, 504}) +# Matches the status code the SDK embeds in catch-all APIFailure messages +# (socketdev/core/api.py: "Bad Request: HTTP original_status_code: ..."). +_API_FAILURE_STATUS_CODE_RE = re.compile(r"original_status_code:(\d{3})") + + +def _is_transient_full_scan_upload_error(error: Exception) -> bool: + """Whether a full-scan upload failure is transient and safe to retry. + + Transient means the failure happened at the gateway/connection level, normally before the + server finished reading the request body (so no scan was created server-side): HTTP + 502/503/504/408, client-side timeouts, and dropped/reset connections. 4xx client errors + (400/401/403/404/429) and success responses carrying an error payload are never retried. + """ + if isinstance(error, (APIBadGateway, APIConnectionError, APITimeout)): + # 502 / connection reset-dropped / request timeout - the SDK raises dedicated classes. + return True + if type(error) is APIFailure: + # The SDK raises 408/503/504 (and every other status without a dedicated class, + # including 400) as the catch-all APIFailure, so match on the exact class plus the + # status code embedded in the message. Subclasses (APIAccessDenied, APIResourceNotFound, + # APIInsufficientQuota, ...) are deliberately excluded - those are never transient. + match = _API_FAILURE_STATUS_CODE_RE.search(str(error)) + if match: + return int(match.group(1)) in FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES + return False + def _humanize_alert_type(alert_type: str) -> str: """Convert a camelCase/PascalCase alert type into a Title-Cased label. @@ -787,7 +835,34 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths: # facts file under the per-file upload size cap. See _compress_facts_files_for_upload. upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files) try: - res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths) + # Retry transient gateway/timeout failures (502/503/504/408, dropped connections, + # timeouts) with increasing waits; such failures are intermittent and a retried + # upload almost always succeeds. In these failure modes the server never finished + # reading the request body, so no scan was created and a retry does not duplicate + # one (see the retry-policy comment above FULL_SCAN_UPLOAD_MAX_ATTEMPTS). fullscans.post() + # rebuilds its lazy file loaders from the plain paths in upload_files on every call, + # so simply calling it again per attempt is safe. The loop must stay inside this try + # so the temp .br files (cleaned up in the finally below) outlive every attempt. + for attempt in range(1, FULL_SCAN_UPLOAD_MAX_ATTEMPTS + 1): + try: + res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths) + break + except APIFailure as error: + if attempt >= FULL_SCAN_UPLOAD_MAX_ATTEMPTS or not _is_transient_full_scan_upload_error(error): + raise + backoff_index = min(attempt, len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS)) - 1 + wait_seconds = FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS[backoff_index] + random.uniform( + 0, FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS + ) + # SDK error messages can span many lines (path + response headers); the + # first line carries the status, which is all the warning needs. + error_summary = str(error).strip().splitlines()[0] if str(error).strip() else "" + log.warning( + f"Full scan upload failed with {type(error).__name__}({error_summary}), " + f"retrying in {wait_seconds:.0f}s " + f"(attempt {attempt + 1}/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})" + ) + time.sleep(wait_seconds) finally: for temp_file in compressed_temp_files: try: diff --git a/tests/unit/test_full_scan_retry.py b/tests/unit/test_full_scan_retry.py new file mode 100644 index 0000000..16a6fb3 --- /dev/null +++ b/tests/unit/test_full_scan_retry.py @@ -0,0 +1,265 @@ +"""Tests for the full-scan upload retry on transient gateway/connection failures. + +A `POST /orgs//full-scans` upload can fail transiently (an HTTP 502/503/504/408, a +dropped or reset connection, or a timeout) without the server having created the scan. +`Core.create_full_scan` retries those transient failures; these tests cover the retry +classification, the loop bounds, and that the temporary brotli-compressed facts files +survive until every attempt has finished. +""" + +import logging +from unittest.mock import MagicMock + +import pytest +from socketdev.exceptions import ( + APIAccessDenied, + APIBadGateway, + APIConnectionError, + APIFailure, + APIResourceNotFound, + APITimeout, +) +from socketdev.fullscans import FullScanMetadata + +from socketsecurity.core import ( + FULL_SCAN_UPLOAD_MAX_ATTEMPTS, + SOCKET_FACTS_BROTLI_FILENAME, + SOCKET_FACTS_FILENAME, + Core, + _is_transient_full_scan_upload_error, +) + + +def _success_response(): + metadata = FullScanMetadata( + id="scan-1", + created_at="2026-01-01T00:00:00Z", + updated_at="2026-01-01T00:00:00Z", + organization_id="org-1", + repository_id="repo-1", + branch="main", + html_report_url="https://socket.dev/report", + ) + response = MagicMock() + response.success = True + response.data = metadata + return response + + +# Catch-all APIFailure messages as the SDK formats them (socketdev/core/api.py); only the +# embedded original_status_code distinguishes a transient 503/504/408 from e.g. a 400. +def _catch_all_failure(status_code: int) -> APIFailure: + return APIFailure( + f"Bad Request: HTTP original_status_code:{status_code}\n" + f"Path: https://api.socket.dev/v0/orgs/org/full-scans\n\n" + f"Headers:\ncf-ray: abc123" + ) + + +@pytest.fixture +def core_with_mock_sdk(): + # Build a Core without running org setup; we only exercise create_full_scan. + core = Core.__new__(Core) + core.sdk = MagicMock() + core.cli_config = None # skip the tier1 finalize branch + return core + + +@pytest.fixture(autouse=True) +def no_sleep(mocker): + return mocker.patch("socketsecurity.core.time.sleep") + + +def test_upload_succeeds_first_try(core_with_mock_sdk, tmp_path, no_sleep): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.return_value = _success_response() + + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +def test_upload_retries_on_502_then_succeeds( + core_with_mock_sdk, tmp_path, no_sleep, caplog +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + APIBadGateway(), + APIBadGateway(), + _success_response(), + ] + + with caplog.at_level(logging.WARNING, logger="socketdev"): + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 3 + assert no_sleep.call_count == 2 # waits before attempts 2 and 3 + retry_warnings = [r for r in caplog.records if "retrying in" in r.message] + assert len(retry_warnings) == 2 + assert "APIBadGateway" in retry_warnings[0].message + assert f"(attempt 2/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})" in retry_warnings[0].message + + +def test_upload_raises_after_exhausting_attempts( + core_with_mock_sdk, tmp_path, no_sleep +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway() + + with pytest.raises(APIBadGateway): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert ( + core_with_mock_sdk.sdk.fullscans.post.call_count + == FULL_SCAN_UPLOAD_MAX_ATTEMPTS + ) + + +def test_upload_retries_on_catch_all_503(core_with_mock_sdk, tmp_path, no_sleep): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + _catch_all_failure(503), + _success_response(), + ] + + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2 + + +@pytest.mark.parametrize("error_class", [APIConnectionError, APITimeout]) +def test_upload_retries_on_connection_level_errors( + core_with_mock_sdk, tmp_path, no_sleep, error_class +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + error_class(), + _success_response(), + ] + + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2 + + +def test_upload_does_not_retry_on_400(core_with_mock_sdk, tmp_path, no_sleep): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = _catch_all_failure(400) + + with pytest.raises(APIFailure): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +@pytest.mark.parametrize("error_class", [APIAccessDenied, APIResourceNotFound]) +def test_upload_does_not_retry_on_dedicated_4xx_classes( + core_with_mock_sdk, tmp_path, no_sleep, error_class +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = error_class() + + with pytest.raises(error_class): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +def test_upload_does_not_retry_on_error_payload(core_with_mock_sdk, tmp_path, no_sleep): + # A response that came back but reports failure (res.success False) is not transient. + manifest = tmp_path / "package.json" + manifest.write_text("{}") + failed = MagicMock() + failed.success = False + failed.message = "tarball too large" + failed.status = 200 + core_with_mock_sdk.sdk.fullscans.post.return_value = failed + + with pytest.raises(Exception, match="tarball too large"): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +def test_temp_br_file_survives_retries_and_is_cleaned_after( + core_with_mock_sdk, tmp_path, no_sleep +): + # The brotli-compressed facts sibling must stay on disk across every retry attempt + # (the SDK re-reads it per call) and only be deleted once all attempts finished. + facts = tmp_path / SOCKET_FACTS_FILENAME + facts.write_text('{"components": []}') + compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME + br_present_per_attempt = [] + + def post_side_effect(upload_files, *args, **kwargs): + br_present_per_attempt.append(compressed.is_file()) + assert str(compressed) in upload_files + if len(br_present_per_attempt) < 3: + raise APIBadGateway() + return _success_response() + + core_with_mock_sdk.sdk.fullscans.post.side_effect = post_side_effect + + full_scan = core_with_mock_sdk.create_full_scan([str(facts)], MagicMock()) + + assert full_scan.id == "scan-1" + assert br_present_per_attempt == [True, True, True] + assert not compressed.exists() # cleaned up after the attempts finished + assert facts.is_file() # the original facts file is never touched + + +def test_temp_br_file_cleaned_after_exhausted_retries( + core_with_mock_sdk, tmp_path, no_sleep +): + facts = tmp_path / SOCKET_FACTS_FILENAME + facts.write_text('{"components": []}') + compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME + core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway() + + with pytest.raises(APIBadGateway): + core_with_mock_sdk.create_full_scan([str(facts)], MagicMock()) + + assert ( + core_with_mock_sdk.sdk.fullscans.post.call_count + == FULL_SCAN_UPLOAD_MAX_ATTEMPTS + ) + assert not compressed.exists() + + +def test_transient_classifier(): + assert _is_transient_full_scan_upload_error(APIBadGateway()) + assert _is_transient_full_scan_upload_error(APIConnectionError()) + assert _is_transient_full_scan_upload_error(APITimeout()) + assert _is_transient_full_scan_upload_error(_catch_all_failure(408)) + assert _is_transient_full_scan_upload_error(_catch_all_failure(503)) + assert _is_transient_full_scan_upload_error(_catch_all_failure(504)) + + assert not _is_transient_full_scan_upload_error(_catch_all_failure(400)) + assert not _is_transient_full_scan_upload_error(_catch_all_failure(500)) + assert not _is_transient_full_scan_upload_error( + APIFailure() + ) # wrapped unexpected error + assert not _is_transient_full_scan_upload_error(APIAccessDenied("denied")) + assert not _is_transient_full_scan_upload_error(APIResourceNotFound()) + # Subclasses never match the catch-all branch, even with a retryable-looking message. + assert not _is_transient_full_scan_upload_error( + APIAccessDenied("original_status_code:503") + ) + assert not _is_transient_full_scan_upload_error( + ValueError("original_status_code:503") + ) diff --git a/uv.lock b/uv.lock index 29cd641..5f6ffdf 100644 --- a/uv.lock +++ b/uv.lock @@ -1283,7 +1283,7 @@ wheels = [ [[package]] name = "socketsecurity" -version = "2.4.7" +version = "2.4.8" source = { editable = "." } dependencies = [ { name = "brotli", marker = "platform_python_implementation == 'CPython'" }, From 8dc097002b5126a85d608fa24492be17101773e8 Mon Sep 17 00:00:00 2001 From: Martin Torp Date: Wed, 10 Jun 2026 11:42:33 +0200 Subject: [PATCH 2/4] docs: drop the 'retry almost always succeeds' claim from retry comments --- socketsecurity/core/__init__.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/socketsecurity/core/__init__.py b/socketsecurity/core/__init__.py index 045d8d5..dbf30f7 100644 --- a/socketsecurity/core/__init__.py +++ b/socketsecurity/core/__init__.py @@ -84,11 +84,10 @@ # Full scan upload retry policy. An upload can fail transiently at the gateway/connection # level (an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout) -# without the server having created the scan; a retried upload almost always succeeds. -# In these failure modes no scan was created, so a retry does not duplicate one. (A -# duplicate is possible only if a gateway timeout races a request the server later -# completes; that is benign - the retried scan supersedes the orphaned one, same as -# running the CLI twice.) +# without the server having created the scan. In these failure modes no scan was created, +# so a retry does not duplicate one. (A duplicate is possible only if a gateway timeout +# races a request the server later completes; that is benign - the retried scan supersedes +# the orphaned one, same as running the CLI twice.) FULL_SCAN_UPLOAD_MAX_ATTEMPTS = 3 # Wait before retry attempt 2 and attempt 3 respectively (plus a little jitter so a fleet of # CI jobs hitting the same failure doesn't retry in lock-step). @@ -836,8 +835,7 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths: upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files) try: # Retry transient gateway/timeout failures (502/503/504/408, dropped connections, - # timeouts) with increasing waits; such failures are intermittent and a retried - # upload almost always succeeds. In these failure modes the server never finished + # timeouts) with increasing waits. In these failure modes the server never finished # reading the request body, so no scan was created and a retry does not duplicate # one (see the retry-policy comment above FULL_SCAN_UPLOAD_MAX_ATTEMPTS). fullscans.post() # rebuilds its lazy file loaders from the plain paths in upload_files on every call, From 487557abc917b147489942d703cb36b121ce7f60 Mon Sep 17 00:00:00 2001 From: Martin Torp Date: Wed, 10 Jun 2026 13:26:04 +0200 Subject: [PATCH 3/4] Move transient-error classification into the SDK; simplify retry loop Address review feedback on the upload retry: - The retry decision now delegates to APIFailure.is_transient_error() (socketdev>=3.3.0, SocketDev/socket-sdk-python#93), which classifies by the HTTP status code the SDK records when raising. The CLI no longer encodes the SDK's exception hierarchy or parses status codes out of message text, so SDK restructuring can't silently break the classification. - The backoff schedule is now the single source of truth for the loop: FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0, None), where each entry is the wait before the next attempt and the final None re-raises instead of retrying. FULL_SCAN_UPLOAD_MAX_ATTEMPTS is computed from its length. Note: uv.lock is intentionally not regenerated yet - socketdev 3.3.0 must be released to PyPI first (blocked on socket-sdk-python#93). --- CHANGELOG.md | 9 ++- pyproject.toml | 2 +- socketsecurity/core/__init__.py | 71 +++++++----------------- tests/unit/test_full_scan_retry.py | 88 ++++++++++++++++++------------ 4 files changed, 81 insertions(+), 89 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ea6b3b..e8d3d05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,11 +10,14 @@ jitter). Such failures are intermittent and a retried upload almost always succeeds. In these failure modes the server never finished reading the request body, so no scan was created and a retry does not duplicate one; in the rare case where a gateway - timeout races a request the server later - completes, the extra scan is benign and superseded by the retried one (as if the CLI had - run twice). + timeout races a request the server later completes, the extra scan is benign and + superseded by the retried one (as if the CLI had run twice). Non-transient errors (400/401/403/404/429 and error payloads) are never retried. Each retry logs a warning explaining what failed and when the next attempt happens. +- Requires `socketdev>=3.3.0`: the SDK now records the HTTP status code on the exceptions + it raises and owns the transient-vs-deterministic classification + (`APIFailure.is_transient_error()`), so the CLI no longer parses status codes out of + exception message text. ## 2.4.7 diff --git a/pyproject.toml b/pyproject.toml index 18c6332..7876f8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ dependencies = [ 'GitPython', 'packaging', 'python-dotenv', - "socketdev>=3.2.1,<4.0.0", + "socketdev>=3.3.0,<4.0.0", "bs4>=0.0.2", "markdown>=3.10", "brotli>=1.0.9; platform_python_implementation == 'CPython'", diff --git a/socketsecurity/core/__init__.py b/socketsecurity/core/__init__.py index dbf30f7..4e6c032 100644 --- a/socketsecurity/core/__init__.py +++ b/socketsecurity/core/__init__.py @@ -14,12 +14,7 @@ if TYPE_CHECKING: from socketsecurity.config import CliConfig from socketdev import socketdev -from socketdev.exceptions import ( - APIBadGateway, - APIConnectionError, - APIFailure, - APITimeout, -) +from socketdev.exceptions import APIFailure from socketdev.fullscans import FullScanParams, SocketArtifact from socketdev.org import Organization from socketdev.repos import RepositoryInfo @@ -84,44 +79,18 @@ # Full scan upload retry policy. An upload can fail transiently at the gateway/connection # level (an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout) -# without the server having created the scan. In these failure modes no scan was created, -# so a retry does not duplicate one. (A duplicate is possible only if a gateway timeout -# races a request the server later completes; that is benign - the retried scan supersedes -# the orphaned one, same as running the CLI twice.) -FULL_SCAN_UPLOAD_MAX_ATTEMPTS = 3 -# Wait before retry attempt 2 and attempt 3 respectively (plus a little jitter so a fleet of -# CI jobs hitting the same failure doesn't retry in lock-step). -FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0) +# without the server having created the scan; the SDK classifies those failures via +# APIFailure.is_transient_error() (socketdev>=3.3.0). In these failure modes no scan was +# created, so a retry does not duplicate one. (A duplicate is possible only if a gateway +# timeout races a request the server later completes; that is benign - the retried scan +# supersedes the orphaned one, same as running the CLI twice.) +# +# Each schedule entry is the wait before the next attempt once the current one fails (plus +# a little jitter so a fleet of CI jobs hitting the same failure doesn't retry in +# lock-step); the final None means the last attempt's failure is re-raised, not retried. +FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0, None) +FULL_SCAN_UPLOAD_MAX_ATTEMPTS = len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS) FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS = 2.0 -# Transient gateway/timeout HTTP statuses that the SDK does NOT raise as a dedicated -# exception class (502 has APIBadGateway; 408/503/504 surface as the catch-all APIFailure -# with the status only present in the message text - see _is_transient_full_scan_upload_error). -FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES = frozenset({408, 503, 504}) -# Matches the status code the SDK embeds in catch-all APIFailure messages -# (socketdev/core/api.py: "Bad Request: HTTP original_status_code: ..."). -_API_FAILURE_STATUS_CODE_RE = re.compile(r"original_status_code:(\d{3})") - - -def _is_transient_full_scan_upload_error(error: Exception) -> bool: - """Whether a full-scan upload failure is transient and safe to retry. - - Transient means the failure happened at the gateway/connection level, normally before the - server finished reading the request body (so no scan was created server-side): HTTP - 502/503/504/408, client-side timeouts, and dropped/reset connections. 4xx client errors - (400/401/403/404/429) and success responses carrying an error payload are never retried. - """ - if isinstance(error, (APIBadGateway, APIConnectionError, APITimeout)): - # 502 / connection reset-dropped / request timeout - the SDK raises dedicated classes. - return True - if type(error) is APIFailure: - # The SDK raises 408/503/504 (and every other status without a dedicated class, - # including 400) as the catch-all APIFailure, so match on the exact class plus the - # status code embedded in the message. Subclasses (APIAccessDenied, APIResourceNotFound, - # APIInsufficientQuota, ...) are deliberately excluded - those are never transient. - match = _API_FAILURE_STATUS_CODE_RE.search(str(error)) - if match: - return int(match.group(1)) in FULL_SCAN_UPLOAD_RETRYABLE_STATUS_CODES - return False def _humanize_alert_type(alert_type: str) -> str: @@ -837,19 +806,19 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths: # Retry transient gateway/timeout failures (502/503/504/408, dropped connections, # timeouts) with increasing waits. In these failure modes the server never finished # reading the request body, so no scan was created and a retry does not duplicate - # one (see the retry-policy comment above FULL_SCAN_UPLOAD_MAX_ATTEMPTS). fullscans.post() - # rebuilds its lazy file loaders from the plain paths in upload_files on every call, - # so simply calling it again per attempt is safe. The loop must stay inside this try - # so the temp .br files (cleaned up in the finally below) outlive every attempt. - for attempt in range(1, FULL_SCAN_UPLOAD_MAX_ATTEMPTS + 1): + # one (see the retry-policy comment above FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS). + # fullscans.post() rebuilds its lazy file loaders from the plain paths in + # upload_files on every call, so simply calling it again per attempt is safe. The + # loop must stay inside this try so the temp .br files (cleaned up in the finally + # below) outlive every attempt. + for attempt, backoff_seconds in enumerate(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS, start=1): try: res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths) break except APIFailure as error: - if attempt >= FULL_SCAN_UPLOAD_MAX_ATTEMPTS or not _is_transient_full_scan_upload_error(error): + if backoff_seconds is None or not error.is_transient_error(): raise - backoff_index = min(attempt, len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS)) - 1 - wait_seconds = FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS[backoff_index] + random.uniform( + wait_seconds = backoff_seconds + random.uniform( 0, FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS ) # SDK error messages can span many lines (path + response headers); the diff --git a/tests/unit/test_full_scan_retry.py b/tests/unit/test_full_scan_retry.py index 16a6fb3..b31bb11 100644 --- a/tests/unit/test_full_scan_retry.py +++ b/tests/unit/test_full_scan_retry.py @@ -2,9 +2,10 @@ A `POST /orgs//full-scans` upload can fail transiently (an HTTP 502/503/504/408, a dropped or reset connection, or a timeout) without the server having created the scan. -`Core.create_full_scan` retries those transient failures; these tests cover the retry -classification, the loop bounds, and that the temporary brotli-compressed facts files -survive until every attempt has finished. +`Core.create_full_scan` retries the failures the SDK classifies as transient +(`APIFailure.is_transient_error()`, socketdev>=3.3.0); these tests cover the retry +decision, the loop bounds, and that the temporary brotli-compressed facts files survive +until every attempt has finished. """ import logging @@ -26,7 +27,6 @@ SOCKET_FACTS_BROTLI_FILENAME, SOCKET_FACTS_FILENAME, Core, - _is_transient_full_scan_upload_error, ) @@ -46,13 +46,14 @@ def _success_response(): return response -# Catch-all APIFailure messages as the SDK formats them (socketdev/core/api.py); only the -# embedded original_status_code distinguishes a transient 503/504/408 from e.g. a 400. +# Catch-all APIFailure as the SDK raises it for statuses without a dedicated class +# (socketdev/core/api.py); the recorded status_code drives is_transient_error(). def _catch_all_failure(status_code: int) -> APIFailure: return APIFailure( f"Bad Request: HTTP original_status_code:{status_code}\n" f"Path: https://api.socket.dev/v0/orgs/org/full-scans\n\n" - f"Headers:\ncf-ray: abc123" + f"Headers:\ncf-ray: abc123", + status_code=status_code, ) @@ -121,11 +122,14 @@ def test_upload_raises_after_exhausting_attempts( ) -def test_upload_retries_on_catch_all_503(core_with_mock_sdk, tmp_path, no_sleep): +@pytest.mark.parametrize("status_code", [408, 503, 504]) +def test_upload_retries_on_catch_all_transient_statuses( + core_with_mock_sdk, tmp_path, no_sleep, status_code +): manifest = tmp_path / "package.json" manifest.write_text("{}") core_with_mock_sdk.sdk.fullscans.post.side_effect = [ - _catch_all_failure(503), + _catch_all_failure(status_code), _success_response(), ] @@ -164,13 +168,17 @@ def test_upload_does_not_retry_on_400(core_with_mock_sdk, tmp_path, no_sleep): no_sleep.assert_not_called() -@pytest.mark.parametrize("error_class", [APIAccessDenied, APIResourceNotFound]) +@pytest.mark.parametrize( + "error_class,status_code", [(APIAccessDenied, 401), (APIResourceNotFound, 404)] +) def test_upload_does_not_retry_on_dedicated_4xx_classes( - core_with_mock_sdk, tmp_path, no_sleep, error_class + core_with_mock_sdk, tmp_path, no_sleep, error_class, status_code ): manifest = tmp_path / "package.json" manifest.write_text("{}") - core_with_mock_sdk.sdk.fullscans.post.side_effect = error_class() + core_with_mock_sdk.sdk.fullscans.post.side_effect = error_class( + status_code=status_code + ) with pytest.raises(error_class): core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) @@ -241,25 +249,37 @@ def test_temp_br_file_cleaned_after_exhausted_retries( assert not compressed.exists() -def test_transient_classifier(): - assert _is_transient_full_scan_upload_error(APIBadGateway()) - assert _is_transient_full_scan_upload_error(APIConnectionError()) - assert _is_transient_full_scan_upload_error(APITimeout()) - assert _is_transient_full_scan_upload_error(_catch_all_failure(408)) - assert _is_transient_full_scan_upload_error(_catch_all_failure(503)) - assert _is_transient_full_scan_upload_error(_catch_all_failure(504)) - - assert not _is_transient_full_scan_upload_error(_catch_all_failure(400)) - assert not _is_transient_full_scan_upload_error(_catch_all_failure(500)) - assert not _is_transient_full_scan_upload_error( - APIFailure() - ) # wrapped unexpected error - assert not _is_transient_full_scan_upload_error(APIAccessDenied("denied")) - assert not _is_transient_full_scan_upload_error(APIResourceNotFound()) - # Subclasses never match the catch-all branch, even with a retryable-looking message. - assert not _is_transient_full_scan_upload_error( - APIAccessDenied("original_status_code:503") - ) - assert not _is_transient_full_scan_upload_error( - ValueError("original_status_code:503") - ) +class _StubFailure(APIFailure): + """An APIFailure whose transience is fixed, regardless of class or status code.""" + + def __init__(self, transient: bool): + super().__init__("stub failure") + self._transient = transient + + def is_transient_error(self) -> bool: + return self._transient + + +@pytest.mark.parametrize("transient,expected_calls", [(True, 2), (False, 1)]) +def test_retry_decision_delegates_to_sdk_classification( + core_with_mock_sdk, tmp_path, no_sleep, transient, expected_calls +): + # The CLI encodes no knowledge of the SDK's exception hierarchy or status codes: + # the retry decision is exactly APIFailure.is_transient_error(). (The transient / + # non-transient truth table itself is tested in the SDK, next to the code that + # raises the exceptions.) + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + _StubFailure(transient), + _success_response(), + ] + + if transient: + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + assert full_scan.id == "scan-1" + else: + with pytest.raises(_StubFailure): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == expected_calls From 6bc8fe831e19651d1c9c120b0abf9ab442b9fecd Mon Sep 17 00:00:00 2001 From: Martin Torp Date: Wed, 10 Jun 2026 13:43:49 +0200 Subject: [PATCH 4/4] Lock socketdev 3.3.0 socketdev 3.3.0 is now released, unblocking the >=3.3.0 floor bump. --- uv.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/uv.lock b/uv.lock index 5f6ffdf..0ed1361 100644 --- a/uv.lock +++ b/uv.lock @@ -1270,15 +1270,15 @@ wheels = [ [[package]] name = "socketdev" -version = "3.2.1" +version = "3.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/21/65/07df2bf6e490c56544fb06e4cfde059b2572fdd5b02ff352c766b1d5f7ce/socketdev-3.2.1.tar.gz", hash = "sha256:7db910a98628473e8ec06822deb01b6bd465b385e9e8ea405f2b7526e8258074", size = 179279, upload-time = "2026-06-03T18:08:19.806Z" } +sdist = { url = "https://files.pythonhosted.org/packages/25/30/16155f7f27d18274f364b3bd3506ee45d17f53fc8938aaea9a618054449b/socketdev-3.3.0.tar.gz", hash = "sha256:3d60bd4ac3201e9d581b1fe02bf2e6aef1b90c13ae75d15a8664aa9ef966734e", size = 181519, upload-time = "2026-06-10T11:41:17.942Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/53/01/fff70923755b3a187ca971189fb078a2aaedcad42d682abfdd06f3445def/socketdev-3.2.1-py3-none-any.whl", hash = "sha256:6dc762d78baea8011dc22f2afe49c84c926e640a6879bd7b58c3abdd4e29e8bb", size = 67266, upload-time = "2026-06-03T18:08:18.029Z" }, + { url = "https://files.pythonhosted.org/packages/33/dd/25622e033182e8c744d2420bb4f056206edc096a1e5ce8e4af4b0a0c0791/socketdev-3.3.0-py3-none-any.whl", hash = "sha256:513c045ce42bdd6cc2bb66a527f5863e0c399e56dbdcb1832cd5d94a5fb1a5e4", size = 67956, upload-time = "2026-06-10T11:41:16.534Z" }, ] [[package]] @@ -1340,7 +1340,7 @@ requires-dist = [ { name = "python-dotenv" }, { name = "requests" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.3.0" }, - { name = "socketdev", specifier = ">=3.2.1,<4.0.0" }, + { name = "socketdev", specifier = ">=3.3.0,<4.0.0" }, { name = "twine", marker = "extra == 'dev'" }, { name = "uv", marker = "extra == 'dev'", specifier = ">=0.1.0" }, ]