diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index 53dfabb96e6..f80913d6f6b 100755
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -51,15 +51,94 @@ if [ "$compile_transient_failure" = true ]; then
else
echo "No transient Maven repository error detected; no retry needed."
fi
-mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \
+# Outer guard: catch test-fork hangs that surefire's own timeouts miss, dump stacks for
+# diagnosis, and kill the run before the job cap. STALL_LIMIT is kept just above the 600s
+# per-fork timeout; MAX_RUNTIME is the absolute ceiling. Each SYSDS_TEST_* variable overrides a default.
+STALL_LIMIT="${SYSDS_TEST_STALL_LIMIT:-660}"
+MAX_RUNTIME="${SYSDS_TEST_MAX_RUNTIME:-1600}"
+dump_dir="${SYSDS_TEST_DUMP_DIR:-/github/workspace/target/thread-dumps}"
+mkdir -p "$dump_dir"
+jstack_bin="${JAVA_HOME:+$JAVA_HOME/bin/}jstack"  # $JAVA_HOME/bin/jstack when JAVA_HOME is set, else jstack from PATH
+
+# Print the pids of all descendants of $1 (each child's subtree before the child), then $1 itself.
+proc_tree() {
+  local parent=$1 kid
+  for kid in $(pgrep -P "$parent" 2>/dev/null); do proc_tree "$kid"; done
+  echo "$parent"
+}
+
+# Thread-dump every JVM in the process tree rooted at $2; $1 is the reason shown in the banners.
+# Each JVM gets a SIGQUIT (stacks relayed into $log) and, as a second copy, a jstack capture
+# written under $dump_dir (at most 30s per JVM; jstack failures are ignored).
+dump_thread_stacks() {
+  local why="$1" top="$2" stamp p name cmdline
+  stamp=$(date +%Y%m%d-%H%M%S)
+  echo "================ HARD-GUARD THREAD DUMP: $why ($stamp) ================"
+  for p in $(proc_tree "$top"); do
+    if [ ! -r "/proc/$p/comm" ]; then continue; fi
+    name=$(cat "/proc/$p/comm" 2>/dev/null)
+    # Only processes whose comm is java or java.bin are dumped.
+    if [ "$name" != java ] && [ "$name" != java.bin ]; then continue; fi
+    cmdline=$(tr '\0' ' ' < "/proc/$p/cmdline" 2>/dev/null | cut -c1-160)
+    echo "---- SIGQUIT dump: pid=$p comm=$name cmd=$cmdline ----"
+    kill -s QUIT "$p" 2>/dev/null
+    timeout 30 "$jstack_bin" -l "$p" > "$dump_dir/jstack_${p}_${stamp}.txt" 2>&1 || true
+  done
+  # Let the JVMs flush their dumps into the relayed output stream.
+  sleep 12
+  echo "================ END HARD-GUARD THREAD DUMP ($why) ================"
+}
+
+# Background the run so the guard can watch it; $1 stays unquoted to keep the extra -D flags it carries.
+( : > "$log"; mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \
| stdbuf -oL grep -Ev "already exists in destination.|Using incubator" \
- | tee $log
+  | tee -a "$log" ) &  # append-only, like the guard's own writes, so nothing written to $log gets overwritten
+runner=$!
+
+guard_tripped=false
+interval=15
+idle=0
+prev_lines=-1
+start=$(date +%s)
+while kill -0 "$runner" 2>/dev/null; do
+  sleep "$interval"
+  now=$(date +%s)
+  runtime=$((now - start))
+  # Progress means the log's line count changed since the previous poll; otherwise idle time grows.
+  lines=$(wc -l < "$log" 2>/dev/null || echo 0)
+  if ! [ "$lines" -eq "$prev_lines" ]; then
+    idle=0
+    prev_lines=$lines
+  else
+    idle=$((idle + interval))
+  fi
+
+  # A stall takes precedence over the absolute ceiling when both limits are hit.
+  reason=""
+  [ "$runtime" -ge "$MAX_RUNTIME" ] && reason="exceeded absolute runtime ${runtime}s (max ${MAX_RUNTIME}s)"
+  [ "$idle" -ge "$STALL_LIMIT" ] && reason="no test output for ${idle}s (stall limit ${STALL_LIMIT}s)"
+  [ -n "$reason" ] || continue
+
+  # Tripped: report and dump stacks (both also appended to $log), then kill the whole tree.
+  guard_tripped=true
+  {
+    printf '\n%s\n' "##[error] HARD GUARD TRIPPED: $reason"
+    printf '%s\n' "Last test classes seen before the stall:"
+    grep -E "Running org.apache" "$log" | tail -5
+  } | tee -a "$log"
+  dump_thread_stacks "$reason" "$runner" 2>&1 | tee -a "$log"
+  for pid in $(proc_tree "$runner"); do
+    kill -9 "$pid" 2>/dev/null
+  done
+  break
+done
+wait "$runner" 2>/dev/null
grep_args="SUCCESS"
grepvals="$( tail -n 100 $log | grep $grep_args)"
-if [[ $grepvals == *"SUCCESS"* ]]; then
+if [ "$guard_tripped" = false ] && [[ $grepvals == *"SUCCESS"* ]]; then
# Merge Federated test runs.
# if merged jacoco exist temporarily rename to not overwrite.
[ -f target/jacoco.exec ] && mv target/jacoco.exec target/jacoco_main.exec
diff --git a/pom.xml b/pom.xml
index 5762dc2289e..42a00c469ca 100644
--- a/pom.xml
+++ b/pom.xml
@@ -411,6 +411,8 @@
${test-forkCount}
false
+
+ alphabetical
${test-forkedProcessTimeout}