diff --git a/CHANGELOG.md b/CHANGELOG.md index 060d486..e8d3d05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,24 @@ # Changelog +## 2.4.8 + +### Fixed: retry transient full-scan upload failures + +- The full-scan upload (`POST /orgs//full-scans`) now retries transient + gateway/connection failures — HTTP 502/503/504/408, dropped or reset connections, and + request timeouts — up to 3 total attempts with increasing waits (~10s, then ~30s, plus + jitter). Such failures are intermittent and a retried upload almost always succeeds. + In these failure modes the server never finished reading the request body, so no scan + was created and a retry does not duplicate one; in the rare case where a gateway + timeout races a request the server later completes, the extra scan is benign and + superseded by the retried one (as if the CLI had run twice). + Non-transient errors (400/401/403/404/429 and error payloads) are never retried. Each + retry logs a warning explaining what failed and when the next attempt happens. +- Requires `socketdev>=3.3.0`: the SDK now records the HTTP status code on the exceptions + it raises and owns the transient-vs-deterministic classification + (`APIFailure.is_transient_error()`), so the CLI no longer parses status codes out of + exception message text. + ## 2.4.7 ### Changed: pin @coana-tech/cli version; auto-update is now opt-in diff --git a/pyproject.toml b/pyproject.toml index 6bed831..7876f8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "hatchling.build" [project] name = "socketsecurity" -version = "2.4.7" +version = "2.4.8" requires-python = ">= 3.11" license = {"file" = "LICENSE"} dependencies = [ @@ -16,7 +16,7 @@ dependencies = [ 'GitPython', 'packaging', 'python-dotenv', - "socketdev>=3.2.1,<4.0.0", + "socketdev>=3.3.0,<4.0.0", "bs4>=0.0.2", "markdown>=3.10", "brotli>=1.0.9; platform_python_implementation == 'CPython'", diff --git a/socketsecurity/__init__.py b/socketsecurity/__init__.py index 91c7faf..9f1797f 100644 --- a/socketsecurity/__init__.py +++ b/socketsecurity/__init__.py @@ -1,3 +1,3 @@ __author__ = 'socket.dev' -__version__ = '2.4.7' +__version__ = '2.4.8' USER_AGENT = f'SocketPythonCLI/{__version__}' diff --git a/socketsecurity/core/__init__.py b/socketsecurity/core/__init__.py index 20ff28f..4e6c032 100644 --- a/socketsecurity/core/__init__.py +++ b/socketsecurity/core/__init__.py @@ -1,5 +1,6 @@ import logging import os +import random import re import sys import tarfile @@ -76,6 +77,21 @@ TIER1_FINALIZE_MAX_ATTEMPTS = 3 TIER1_FINALIZE_BACKOFF_SECONDS = 1.0 +# Full scan upload retry policy. An upload can fail transiently at the gateway/connection +# level (an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout) +# without the server having created the scan; the SDK classifies those failures via +# APIFailure.is_transient_error() (socketdev>=3.3.0). In these failure modes no scan was +# created, so a retry does not duplicate one. (A duplicate is possible only if a gateway +# timeout races a request the server later completes; that is benign - the retried scan +# supersedes the orphaned one, same as running the CLI twice.) +# +# Each schedule entry is the wait before the next attempt once the current one fails (plus +# a little jitter so a fleet of CI jobs hitting the same failure doesn't retry in +# lock-step); the final None means the last attempt's failure is re-raised, not retried. +FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0, None) +FULL_SCAN_UPLOAD_MAX_ATTEMPTS = len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS) +FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS = 2.0 + def _humanize_alert_type(alert_type: str) -> str: """Convert a camelCase/PascalCase alert type into a Title-Cased label. @@ -787,7 +803,33 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths: # facts file under the per-file upload size cap. See _compress_facts_files_for_upload. upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files) try: - res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths) + # Retry transient gateway/timeout failures (502/503/504/408, dropped connections, + # timeouts) with increasing waits. In these failure modes the server never finished + # reading the request body, so no scan was created and a retry does not duplicate + # one (see the retry-policy comment above FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS). + # fullscans.post() rebuilds its lazy file loaders from the plain paths in + # upload_files on every call, so simply calling it again per attempt is safe. The + # loop must stay inside this try so the temp .br files (cleaned up in the finally + # below) outlive every attempt. + for attempt, backoff_seconds in enumerate(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS, start=1): + try: + res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths) + break + except APIFailure as error: + if backoff_seconds is None or not error.is_transient_error(): + raise + wait_seconds = backoff_seconds + random.uniform( + 0, FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS + ) + # SDK error messages can span many lines (path + response headers); the + # first line carries the status, which is all the warning needs. + error_summary = str(error).strip().splitlines()[0] if str(error).strip() else "" + log.warning( + f"Full scan upload failed with {type(error).__name__}({error_summary}), " + f"retrying in {wait_seconds:.0f}s " + f"(attempt {attempt + 1}/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})" + ) + time.sleep(wait_seconds) finally: for temp_file in compressed_temp_files: try: diff --git a/tests/unit/test_full_scan_retry.py b/tests/unit/test_full_scan_retry.py new file mode 100644 index 0000000..b31bb11 --- /dev/null +++ b/tests/unit/test_full_scan_retry.py @@ -0,0 +1,285 @@ +"""Tests for the full-scan upload retry on transient gateway/connection failures. + +A `POST /orgs//full-scans` upload can fail transiently (an HTTP 502/503/504/408, a +dropped or reset connection, or a timeout) without the server having created the scan. +`Core.create_full_scan` retries the failures the SDK classifies as transient +(`APIFailure.is_transient_error()`, socketdev>=3.3.0); these tests cover the retry +decision, the loop bounds, and that the temporary brotli-compressed facts files survive +until every attempt has finished. +""" + +import logging +from unittest.mock import MagicMock + +import pytest +from socketdev.exceptions import ( + APIAccessDenied, + APIBadGateway, + APIConnectionError, + APIFailure, + APIResourceNotFound, + APITimeout, +) +from socketdev.fullscans import FullScanMetadata + +from socketsecurity.core import ( + FULL_SCAN_UPLOAD_MAX_ATTEMPTS, + SOCKET_FACTS_BROTLI_FILENAME, + SOCKET_FACTS_FILENAME, + Core, +) + + +def _success_response(): + metadata = FullScanMetadata( + id="scan-1", + created_at="2026-01-01T00:00:00Z", + updated_at="2026-01-01T00:00:00Z", + organization_id="org-1", + repository_id="repo-1", + branch="main", + html_report_url="https://socket.dev/report", + ) + response = MagicMock() + response.success = True + response.data = metadata + return response + + +# Catch-all APIFailure as the SDK raises it for statuses without a dedicated class +# (socketdev/core/api.py); the recorded status_code drives is_transient_error(). +def _catch_all_failure(status_code: int) -> APIFailure: + return APIFailure( + f"Bad Request: HTTP original_status_code:{status_code}\n" + f"Path: https://api.socket.dev/v0/orgs/org/full-scans\n\n" + f"Headers:\ncf-ray: abc123", + status_code=status_code, + ) + + +@pytest.fixture +def core_with_mock_sdk(): + # Build a Core without running org setup; we only exercise create_full_scan. + core = Core.__new__(Core) + core.sdk = MagicMock() + core.cli_config = None # skip the tier1 finalize branch + return core + + +@pytest.fixture(autouse=True) +def no_sleep(mocker): + return mocker.patch("socketsecurity.core.time.sleep") + + +def test_upload_succeeds_first_try(core_with_mock_sdk, tmp_path, no_sleep): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.return_value = _success_response() + + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +def test_upload_retries_on_502_then_succeeds( + core_with_mock_sdk, tmp_path, no_sleep, caplog +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + APIBadGateway(), + APIBadGateway(), + _success_response(), + ] + + with caplog.at_level(logging.WARNING, logger="socketdev"): + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 3 + assert no_sleep.call_count == 2 # waits before attempts 2 and 3 + retry_warnings = [r for r in caplog.records if "retrying in" in r.message] + assert len(retry_warnings) == 2 + assert "APIBadGateway" in retry_warnings[0].message + assert f"(attempt 2/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})" in retry_warnings[0].message + + +def test_upload_raises_after_exhausting_attempts( + core_with_mock_sdk, tmp_path, no_sleep +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway() + + with pytest.raises(APIBadGateway): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert ( + core_with_mock_sdk.sdk.fullscans.post.call_count + == FULL_SCAN_UPLOAD_MAX_ATTEMPTS + ) + + +@pytest.mark.parametrize("status_code", [408, 503, 504]) +def test_upload_retries_on_catch_all_transient_statuses( + core_with_mock_sdk, tmp_path, no_sleep, status_code +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + _catch_all_failure(status_code), + _success_response(), + ] + + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2 + + +@pytest.mark.parametrize("error_class", [APIConnectionError, APITimeout]) +def test_upload_retries_on_connection_level_errors( + core_with_mock_sdk, tmp_path, no_sleep, error_class +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + error_class(), + _success_response(), + ] + + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2 + + +def test_upload_does_not_retry_on_400(core_with_mock_sdk, tmp_path, no_sleep): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = _catch_all_failure(400) + + with pytest.raises(APIFailure): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +@pytest.mark.parametrize( + "error_class,status_code", [(APIAccessDenied, 401), (APIResourceNotFound, 404)] +) +def test_upload_does_not_retry_on_dedicated_4xx_classes( + core_with_mock_sdk, tmp_path, no_sleep, error_class, status_code +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = error_class( + status_code=status_code + ) + + with pytest.raises(error_class): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +def test_upload_does_not_retry_on_error_payload(core_with_mock_sdk, tmp_path, no_sleep): + # A response that came back but reports failure (res.success False) is not transient. + manifest = tmp_path / "package.json" + manifest.write_text("{}") + failed = MagicMock() + failed.success = False + failed.message = "tarball too large" + failed.status = 200 + core_with_mock_sdk.sdk.fullscans.post.return_value = failed + + with pytest.raises(Exception, match="tarball too large"): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +def test_temp_br_file_survives_retries_and_is_cleaned_after( + core_with_mock_sdk, tmp_path, no_sleep +): + # The brotli-compressed facts sibling must stay on disk across every retry attempt + # (the SDK re-reads it per call) and only be deleted once all attempts finished. + facts = tmp_path / SOCKET_FACTS_FILENAME + facts.write_text('{"components": []}') + compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME + br_present_per_attempt = [] + + def post_side_effect(upload_files, *args, **kwargs): + br_present_per_attempt.append(compressed.is_file()) + assert str(compressed) in upload_files + if len(br_present_per_attempt) < 3: + raise APIBadGateway() + return _success_response() + + core_with_mock_sdk.sdk.fullscans.post.side_effect = post_side_effect + + full_scan = core_with_mock_sdk.create_full_scan([str(facts)], MagicMock()) + + assert full_scan.id == "scan-1" + assert br_present_per_attempt == [True, True, True] + assert not compressed.exists() # cleaned up after the attempts finished + assert facts.is_file() # the original facts file is never touched + + +def test_temp_br_file_cleaned_after_exhausted_retries( + core_with_mock_sdk, tmp_path, no_sleep +): + facts = tmp_path / SOCKET_FACTS_FILENAME + facts.write_text('{"components": []}') + compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME + core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway() + + with pytest.raises(APIBadGateway): + core_with_mock_sdk.create_full_scan([str(facts)], MagicMock()) + + assert ( + core_with_mock_sdk.sdk.fullscans.post.call_count + == FULL_SCAN_UPLOAD_MAX_ATTEMPTS + ) + assert not compressed.exists() + + +class _StubFailure(APIFailure): + """An APIFailure whose transience is fixed, regardless of class or status code.""" + + def __init__(self, transient: bool): + super().__init__("stub failure") + self._transient = transient + + def is_transient_error(self) -> bool: + return self._transient + + +@pytest.mark.parametrize("transient,expected_calls", [(True, 2), (False, 1)]) +def test_retry_decision_delegates_to_sdk_classification( + core_with_mock_sdk, tmp_path, no_sleep, transient, expected_calls +): + # The CLI encodes no knowledge of the SDK's exception hierarchy or status codes: + # the retry decision is exactly APIFailure.is_transient_error(). (The transient / + # non-transient truth table itself is tested in the SDK, next to the code that + # raises the exceptions.) + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + _StubFailure(transient), + _success_response(), + ] + + if transient: + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + assert full_scan.id == "scan-1" + else: + with pytest.raises(_StubFailure): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == expected_calls diff --git a/uv.lock b/uv.lock index 29cd641..0ed1361 100644 --- a/uv.lock +++ b/uv.lock @@ -1270,20 +1270,20 @@ wheels = [ [[package]] name = "socketdev" -version = "3.2.1" +version = "3.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/21/65/07df2bf6e490c56544fb06e4cfde059b2572fdd5b02ff352c766b1d5f7ce/socketdev-3.2.1.tar.gz", hash = "sha256:7db910a98628473e8ec06822deb01b6bd465b385e9e8ea405f2b7526e8258074", size = 179279, upload-time = "2026-06-03T18:08:19.806Z" } +sdist = { url = "https://files.pythonhosted.org/packages/25/30/16155f7f27d18274f364b3bd3506ee45d17f53fc8938aaea9a618054449b/socketdev-3.3.0.tar.gz", hash = "sha256:3d60bd4ac3201e9d581b1fe02bf2e6aef1b90c13ae75d15a8664aa9ef966734e", size = 181519, upload-time = "2026-06-10T11:41:17.942Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/53/01/fff70923755b3a187ca971189fb078a2aaedcad42d682abfdd06f3445def/socketdev-3.2.1-py3-none-any.whl", hash = "sha256:6dc762d78baea8011dc22f2afe49c84c926e640a6879bd7b58c3abdd4e29e8bb", size = 67266, upload-time = "2026-06-03T18:08:18.029Z" }, + { url = "https://files.pythonhosted.org/packages/33/dd/25622e033182e8c744d2420bb4f056206edc096a1e5ce8e4af4b0a0c0791/socketdev-3.3.0-py3-none-any.whl", hash = "sha256:513c045ce42bdd6cc2bb66a527f5863e0c399e56dbdcb1832cd5d94a5fb1a5e4", size = 67956, upload-time = "2026-06-10T11:41:16.534Z" }, ] [[package]] name = "socketsecurity" -version = "2.4.7" +version = "2.4.8" source = { editable = "." } dependencies = [ { name = "brotli", marker = "platform_python_implementation == 'CPython'" }, @@ -1340,7 +1340,7 @@ requires-dist = [ { name = "python-dotenv" }, { name = "requests" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.3.0" }, - { name = "socketdev", specifier = ">=3.2.1,<4.0.0" }, + { name = "socketdev", specifier = ">=3.3.0,<4.0.0" }, { name = "twine", marker = "extra == 'dev'" }, { name = "uv", marker = "extra == 'dev'", specifier = ">=0.1.0" }, ]