Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions .github/workflows/javaTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,19 +98,31 @@ jobs:
- name: Checkout Repository
uses: actions/checkout@v6

- name: ${{ matrix.tests }}
uses: ./.github/action/
id: test
with:
test-to-run: ${{ matrix.tests }}

- name: Clean Github Artifact Name of Asterisks
run: |
ARTIFACT_NAME="transient_jacoco"
ARTIFACT_NAME+="-${{ matrix.tests }}"
ARTIFACT_NAME=${ARTIFACT_NAME//\*/x} # replace * with x
echo "ARTIFACT_NAME=$ARTIFACT_NAME" >> $GITHUB_ENV

- name: ${{ matrix.tests }}
uses: ./.github/action/
id: test
with:
test-to-run: ${{ matrix.tests }}

- name: Save Java Test Diagnostics as Artifact
if: always()
uses: actions/upload-artifact@v7
with:
name: diagnostics-${{ env.ARTIFACT_NAME }}
path: |
target/sysdstest.log
target/thread-dumps/**
target/surefire-reports/**
if-no-files-found: ignore
retention-days: 7

- name: Save Java Test Coverage as Artifact
uses: actions/upload-artifact@v7
with:
Expand Down
96 changes: 92 additions & 4 deletions docker/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ export MAVEN_OPTS="-Xmx512m"
# error), unlike genuine compilation or test failures which fail fast.
transient_mvn_error="Could not transfer artifact"

log="/tmp/sysdstest.log"
target_dir="/github/workspace/target"
mkdir -p "$target_dir"
log="$target_dir/sysdstest.log"
compile_log="$(mktemp)"
# test-compile downloads all dependencies; retry once on a transient repo
# error so the test run below can resolve them from the local cache.
Expand All @@ -51,15 +53,101 @@ if [ "$compile_transient_failure" = true ]; then
else
echo "No transient Maven repository error detected; no retry needed."
fi
mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \
# Outer guard: catch test-fork hangs that surefire's own timeouts miss, dump
# stacks for diagnosis, and kill the run before the job cap is reached.
# Both limits are in seconds and can be overridden through the environment.
# STALL_LIMIT: longest tolerated period without new test output; the default
# is kept just above the 600s per-fork timeout.
STALL_LIMIT="${SYSDS_TEST_STALL_LIMIT:-660}"
# MAX_RUNTIME: absolute ceiling for the whole test run, under the job cap.
MAX_RUNTIME="${SYSDS_TEST_MAX_RUNTIME:-1600}"
# Thread dumps are written below the target directory, next to the test log.
dump_dir="$target_dir/thread-dumps"
mkdir -p "$dump_dir"
# Expands to "$JAVA_HOME/bin/jstack" when JAVA_HOME is set and non-empty;
# otherwise to plain "jstack", which is then looked up on PATH.
jstack_bin="${JAVA_HOME:+$JAVA_HOME/bin/}jstack"

# Print a process id together with the ids of all its descendants, one per
# line. Descendants are printed before their parent, so the pid passed in is
# always the last line of output.
# Arguments: $1 - pid at the root of the tree
proc_tree() {
  local parent=$1 kid
  # pgrep -P lists the direct children; recurse into each of them first.
  while read -r kid; do
    [ -n "$kid" ] || continue
    proc_tree "$kid"
  done < <(pgrep -P "$parent" 2>/dev/null)
  echo "$parent"
}

# Dump the thread stacks of every JVM found in a process tree.
# Each process whose /proc comm is "java" or "java.bin" is sent SIGQUIT and is
# additionally inspected with jstack; the jstack output is stored under
# $dump_dir and echoed to stdout so the caller can relay it into the log.
# Globals:   dump_dir (read), jstack_bin (read)
# Arguments: $1 - human-readable reason printed in the banner lines
#            $2 - pid at the root of the process tree to inspect
# Outputs:   banner lines, per-JVM status lines and jstack output on stdout
dump_thread_stacks() {
  local why="$1" top="$2"
  local stamp jvm exe args outfile
  stamp=$(date +%Y%m%d-%H%M%S)

  echo "================ HARD-GUARD THREAD DUMP: $why ($stamp) ================"
  for jvm in $(proc_tree "$top"); do
    # Skip processes that are already gone or whose /proc entry is unreadable.
    if [ ! -r "/proc/$jvm/comm" ]; then
      continue
    fi
    exe=$(cat "/proc/$jvm/comm" 2>/dev/null)
    # Only JVM processes are of interest.
    if [[ "$exe" != "java" && "$exe" != "java.bin" ]]; then
      continue
    fi

    # The command line is NUL-separated; flatten and truncate it for the log.
    args=$(tr '\0' ' ' < "/proc/$jvm/cmdline" 2>/dev/null | cut -c1-160)
    echo "---- SIGQUIT dump: pid=$jvm comm=$exe cmd=$args ----"
    kill -QUIT "$jvm" 2>/dev/null

    outfile="$dump_dir/jstack_${jvm}_${stamp}.txt"
    # Bound jstack to 30 seconds so this loop cannot block on it indefinitely.
    if ! timeout 30 "$jstack_bin" -l "$jvm" > "$outfile" 2>&1; then
      echo "---- jstack dump failed or timed out: pid=$jvm file=$outfile ----"
    else
      echo "---- jstack dump: pid=$jvm file=$outfile ----"
    fi
    cat "$outfile" || true
    echo "---- end jstack dump: pid=$jvm ----"
  done

  # Give the JVMs time to flush their dumps into the relayed output stream.
  sleep 12
  echo "================ END HARD-GUARD THREAD DUMP ($why) ================"
}

# Background the run so the guard can watch it; $1 stays unquoted to keep the extra -D flags it carries.
( mvn -ntp -B test -D maven.test.skip=false -D automatedtestbase.outputbuffering=true -D test=$1 2>&1 \
| stdbuf -oL grep -Ev "already exists in destination.|Using incubator" \
| tee $log
| tee $log ) &
runner=$!

# Watchdog for the backgrounded test run ($runner). While the runner is alive
# it polls every $interval seconds and trips when either the log has not grown
# for STALL_LIMIT seconds or the total runtime reaches MAX_RUNTIME seconds.
# guard_tripped records whether the run was killed by this guard.
guard_tripped=false
start=$(date +%s)
# -1 can never equal a real line count, so the first sample counts as progress.
prev_lines=-1
idle=0
interval=15
while kill -0 "$runner" 2>/dev/null; do
sleep "$interval"
now=$(date +%s)
runtime=$((now - start))
# Progress is measured as growth of the log's line count; fall back to 0 when
# the log cannot be read.
lines=$(wc -l < "$log" 2>/dev/null || echo 0)
if [ "$lines" -eq "$prev_lines" ]; then
# idle advances in whole poll intervals, not in measured wall-clock time.
idle=$((idle + interval))
else
idle=0
prev_lines=$lines
fi

# The stall check takes precedence over the absolute runtime check.
reason=""
if [ "$idle" -ge "$STALL_LIMIT" ]; then
reason="no test output for ${idle}s (stall limit ${STALL_LIMIT}s)"
elif [ "$runtime" -ge "$MAX_RUNTIME" ]; then
reason="exceeded absolute runtime ${runtime}s (max ${MAX_RUNTIME}s)"
fi

if [ -n "$reason" ]; then
guard_tripped=true
# Report the trip and the last five test classes seen, on stdout and in the log.
{
echo ""
echo "##[error] HARD GUARD TRIPPED: $reason"
echo "Last test classes seen before the stall:"
grep -E "Running org.apache" "$log" | tail -5
} | tee -a "$log"
# Collect thread dumps first, then SIGKILL every process in the runner's tree.
dump_thread_stacks "$reason" "$runner" 2>&1 | tee -a "$log"
for pid in $(proc_tree "$runner"); do kill -9 "$pid" 2>/dev/null; done
break
fi
done
# Reap the runner; its exit status is not captured here.
wait "$runner" 2>/dev/null


grep_args="SUCCESS"
grepvals="$( tail -n 100 $log | grep $grep_args)"

if [[ $grepvals == *"SUCCESS"* ]]; then
if [ "$guard_tripped" = false ] && [[ $grepvals == *"SUCCESS"* ]]; then
# Merge Federated test runs.
# if merged jacoco exist temporarily rename to not overwrite.
[ -f target/jacoco.exec ] && mv target/jacoco.exec target/jacoco_main.exec
Expand Down
2 changes: 2 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,8 @@
<!-- 1C means the number of threads times 1 possible maximum forks for testing-->
<forkCount>${test-forkCount}</forkCount>
<reuseForks>false</reuseForks>
<!-- Deterministic class order so a fork hang reproduces at a stable boundary. -->
<runOrder>alphabetical</runOrder>
<!-- Kill hung test forks before CI cancels the whole job. -->
<forkedProcessTimeoutInSeconds>${test-forkedProcessTimeout}</forkedProcessTimeoutInSeconds>
<!-- Force-kill forks that still have live threads after the test class completes. -->
Expand Down
Loading