# Patch summary (free text ahead of the first "diff --git" header):
#
# * .github/workflows/javaTests.yml
#     - moves the "${{ matrix.tests }}" test step below the step that exports
#       ARTIFACT_NAME to $GITHUB_ENV;
#     - adds a "Save Java Test Diagnostics as Artifact" step (if: always()) that
#       uploads target/sysdstest.log, target/thread-dumps/** and
#       target/surefire-reports/** as "diagnostics-${{ env.ARTIFACT_NAME }}"
#       (if-no-files-found: ignore, retention-days: 7).
# * docker/entrypoint.sh
#     - replaces log="/tmp/sysdstest.log" with a log under
#       target_dir="/github/workspace/target" (created with mkdir -p);
#     - the last hunk header on this line (-51,15 +53,101) opens the hunk that
#       adds the outer hang guard around the Maven test run.
#
# NOTE(review): the workflow uploads relative "target/..." paths while the script
# writes to the absolute /github/workspace/target -- presumably the job workspace
# is mounted at /github/workspace inside the container; confirm.
# NOTE(review): line breaks and indentation of this diff were collapsed before it
# reached this file; the text below is left exactly as found.
diff --git a/.github/workflows/javaTests.yml b/.github/workflows/javaTests.yml index 0d4c71e946b..db8088d7c23 100644 --- a/.github/workflows/javaTests.yml +++ b/.github/workflows/javaTests.yml @@ -98,12 +98,6 @@ jobs: - name: Checkout Repository uses: actions/checkout@v6 - - name: ${{ matrix.tests }} - uses: ./.github/action/ - id: test - with: - test-to-run: ${{ matrix.tests }} - - name: Clean Github Artifact Name of Asterisks run: | ARTIFACT_NAME="transient_jacoco" @@ -111,6 +105,24 @@ jobs: ARTIFACT_NAME=${ARTIFACT_NAME//\*/x} # replace * with x echo "ARTIFACT_NAME=$ARTIFACT_NAME" >> $GITHUB_ENV + - name: ${{ matrix.tests }} + uses: ./.github/action/ + id: test + with: + test-to-run: ${{ matrix.tests }} + + - name: Save Java Test Diagnostics as Artifact + if: always() + uses: actions/upload-artifact@v7 + with: + name: diagnostics-${{ env.ARTIFACT_NAME }} + path: | + target/sysdstest.log + target/thread-dumps/** + target/surefire-reports/** + if-no-files-found: ignore + retention-days: 7 + - name: Save Java Test Coverage as Artifact uses: actions/upload-artifact@v7 with: diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 53dfabb96e6..71811b7f5d0 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -32,7 +32,9 @@ export MAVEN_OPTS="-Xmx512m" # error), unlike genuine compilation or test failures which fail fast. transient_mvn_error="Could not transfer artifact" -log="/tmp/sysdstest.log" +target_dir="/github/workspace/target" +mkdir -p "$target_dir" +log="$target_dir/sysdstest.log" compile_log="$(mktemp)" # test-compile downloads all dependencies; retry once on a transient repo # error so the test run below can resolve them from the local cache. @@ -51,15 +53,101 @@ if [ "$compile_transient_failure" = true ]; then else echo "No transient Maven repository error detected; no retry needed." 
fi
-mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \
+# Outer guard: catch test-fork hangs that surefire's own timeouts miss, dump
+# stacks for diagnosis, and kill the run before the job cap (kept just above the
+# 600s per-fork timeout; MAX_RUNTIME is the absolute ceiling under the cap).
+STALL_LIMIT="${SYSDS_TEST_STALL_LIMIT:-660}"   # seconds without a change in the log's line count
+MAX_RUNTIME="${SYSDS_TEST_MAX_RUNTIME:-1600}"  # seconds since the test run was started
+dump_dir="$target_dir/thread-dumps"
+mkdir -p "$dump_dir"
+jstack_bin="${JAVA_HOME:+$JAVA_HOME/bin/}jstack"  # $JAVA_HOME/bin/jstack if JAVA_HOME is set, else jstack from PATH
+
+# proc_tree PID: print PID's descendants (found with pgrep -P, deepest first) and then PID itself, one per line.
+proc_tree() {
+  local pid=$1 child
+  for child in $(pgrep -P "$pid" 2>/dev/null); do proc_tree "$child"; done
+  echo "$pid"
+}
+# dump_thread_stacks REASON ROOT: for every java/java.bin process in ROOT's tree, send SIGQUIT (stacks
+# relayed into $log) and save `jstack -l` output to a file in $dump_dir, which is also printed to stdout.
+dump_thread_stacks() {
+  local reason="$1" root="$2" ts pid comm cmd jstack_file
+  ts=$(date +%Y%m%d-%H%M%S)
+  echo "================ HARD-GUARD THREAD DUMP: $reason ($ts) ================"
+  for pid in $(proc_tree "$root"); do
+    [ -r "/proc/$pid/comm" ] || continue
+    comm=$(cat "/proc/$pid/comm" 2>/dev/null)
+    case "$comm" in
+      java|java.bin) ;;
+      *) continue ;;
+    esac
+    cmd=$(tr '\0' ' ' 2>/dev/null < "/proc/$pid/cmdline" | cut -c1-160)  # 2> before <: mutes the open error if the pid just exited
+    echo "---- SIGQUIT dump: pid=$pid comm=$comm cmd=$cmd ----"
+    kill -3 "$pid" 2>/dev/null
+    jstack_file="$dump_dir/jstack_${pid}_${ts}.txt"
+    if timeout 30 "$jstack_bin" -l "$pid" > "$jstack_file" 2>&1; then
+      echo "---- jstack dump: pid=$pid file=$jstack_file ----"
+    else
+      echo "---- jstack dump failed or timed out: pid=$pid file=$jstack_file ----"
+    fi
+    cat "$jstack_file" || true
+    echo "---- end jstack dump: pid=$pid ----"
+  done
+  # Let the JVMs flush their dumps into the relayed output stream.
+  sleep 12
+  echo "================ END HARD-GUARD THREAD DUMP ($reason) ================"
+}
+
+# Background the run so the guard can watch it; $1 stays unquoted to keep the extra -D flags it carries.
+( : > "$log"; mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \
 	| stdbuf -oL grep -Ev "already exists in destination.|Using incubator" \
-	| tee $log
+	| tee -a "$log" ) &
+runner=$!
+# All $log writers use append mode (one truncation, then tee -a); a non-append tee would overwrite what the guard appends below.
+guard_tripped=false
+start=$(date +%s)
+prev_lines=-1  # line count of $log at the previous poll; -1 makes the first poll count as progress
+idle=0         # seconds since the line count of $log last changed
+interval=15    # seconds between polls
+while kill -0 "$runner" 2>/dev/null; do
+  sleep "$interval"
+  now=$(date +%s)
+  runtime=$((now - start))
+  lines=$(wc -l 2>/dev/null < "$log" || echo 0)  # 2> before <: a missing log yields 0 without an error message
+  if [ "$lines" -eq "$prev_lines" ]; then
+    idle=$((idle + interval))
+  else
+    idle=0
+    prev_lines=$lines
+  fi
+  # The stall limit is checked first; otherwise the absolute runtime ceiling applies.
+  reason=""
+  if [ "$idle" -ge "$STALL_LIMIT" ]; then
+    reason="no test output for ${idle}s (stall limit ${STALL_LIMIT}s)"
+  elif [ "$runtime" -ge "$MAX_RUNTIME" ]; then
+    reason="exceeded absolute runtime ${runtime}s (max ${MAX_RUNTIME}s)"
+  fi
+  # On a trip: record the reason in the log, dump JVM stacks, then SIGKILL the runner's whole process tree.
+  if [ -n "$reason" ]; then
+    guard_tripped=true
+    {
+      echo ""
+      echo "##[error] HARD GUARD TRIPPED: $reason"
+      echo "Last test classes seen before the stall:"
+      grep -E "Running org.apache" "$log" | tail -5
+    } | tee -a "$log"
+    dump_thread_stacks "$reason" "$runner" 2>&1 | tee -a "$log"
+    for pid in $(proc_tree "$runner"); do kill -9 "$pid" 2>/dev/null; done
+    break
+  fi
+done
+wait "$runner" 2>/dev/null
 
 grep_args="SUCCESS"
 grepvals="$( tail -n 100 $log | grep $grep_args)"
 
-if [[ $grepvals == *"SUCCESS"* ]]; then
+if [ "$guard_tripped" = false ] && [[ $grepvals == *"SUCCESS"* ]]; then
 	# Merge Federated test runs.
 	# if merged jacoco exist temporarily rename to not overwrite.
 	[ -f target/jacoco.exec ] && mv target/jacoco.exec target/jacoco_main.exec
diff --git a/pom.xml b/pom.xml
index 5762dc2289e..42a00c469ca 100644
--- a/pom.xml
+++ b/pom.xml
@@ -411,6 +411,8 @@ ${test-forkCount} false + + alphabetical ${test-forkedProcessTimeout}