From 45a250804eada36a222e3309a6afbe5c7e87176f Mon Sep 17 00:00:00 2001 From: Max Isbey <224885523+maxisbey@users.noreply.github.com> Date: Wed, 1 Jul 2026 14:26:59 +0000 Subject: [PATCH 1/4] De-flake conformance CI: solo re-verification, spawn-storm reduction, result artifacts The client conformance suite launches every scenario's client subprocess concurrently; on a 2-vCPU runner the resulting contention intermittently pushes sse-retry's reconnect-timing measurement past its tolerance and fails the job (~4-5% of runs, including pushes to main). - .github/actions/conformance/run-client.sh: wraps the client suite legs. On failure, scenarios listed as unexpected failures are re-run alone on the then-quiet runner: a real failure fails again; a contention artifact passes and the job goes green with a FLAKE_RESCUED marker saved into the results directory. Stale-baseline errors and infra failures are never retried, and an unparseable failure list falls back to the original exit code, so the wrapper cannot green-wash anything but solo-verified passes. - conformance.yml: uv sync gains --compile-bytecode and the editable sources are pre-compiled, so ~40 concurrent interpreters stop racing to byte-compile the same modules during the measurement window; the client command execs the synced venv's interpreter directly instead of paying uv's lockfile re-check in every spawn; both jobs save --output-dir results and upload them when the job fails or a flake was rescued. - everything-server: test_sampling passes related_request_id so the sampling request rides the originating tools/call SSE stream instead of racing the client's standalone GET stream (a dropped request there hangs tools-call-sampling to the 60s client timeout). --- .github/actions/conformance/run-client.sh | 114 ++++++++++++++++++ .github/workflows/conformance.yml | 47 +++++++- .../mcp_everything_server/server.py | 6 +- 3 files changed, 161 insertions(+), 6 deletions(-) create mode 100755 .github/actions/conformance/run-client.sh diff --git a/.github/actions/conformance/run-client.sh b/.github/actions/conformance/run-client.sh new file mode 100755 index 000000000..0ef1b0f37 --- /dev/null +++ b/.github/actions/conformance/run-client.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# Run a client conformance suite, re-verifying unexpected failures solo. +# +# Suite mode launches every scenario's client subprocess concurrently; on a +# 2-vCPU runner that contention can push scenarios with real-time waits (the +# SSE reconnect timing in sse-retry) past their tolerances. So a scenario the +# suite run flags as an unexpected failure is re-run alone on the then-quiet +# runner: a real failure fails again and the job stays red; a contention +# artifact passes and the job goes green, with a FLAKE_RESCUED marker written +# into the --output-dir so the artifact upload preserves the evidence. +# Failures that only reproduce under concurrency are deliberately traded +# away - the suite asserts spec compliance, not behavior under parallel load. +set -uo pipefail + +: "${CONFORMANCE_PKG:?set CONFORMANCE_PKG (pinned in .github/workflows/conformance.yml)}" +SOLO_ATTEMPTS="${CONFORMANCE_SOLO_ATTEMPTS:-2}" + +# Relative paths in the arguments (the client command, --output-dir) resolve +# from the repo root, same contract as run-server.sh. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR/../../.." || exit 1 + +log="$(mktemp)" +trap 'rm -f "$log"' EXIT + +npx --yes "$CONFORMANCE_PKG" client "$@" 2>&1 | tee "$log" +rc=${PIPESTATUS[0]} +if [ "$rc" -eq 0 ]; then + exit 0 +fi + +plain="$(sed 's/\x1b\[[0-9;]*m//g' "$log")" + +# Scenarios listed under "Unexpected failures (not in baseline):". Anything +# else behind the nonzero exit (stale baseline entries, harness or infra +# errors) is not retried. The extraction is coupled to the pinned harness's +# summary wording and print order; if a pin bump changes either, the list +# comes up empty and the original failure passes through - never a false +# green. +mapfile -t scenarios < <( + printf '%s\n' "$plain" | + sed -n '/^Unexpected failures (not in baseline):$/,/^$/p' | + sed -n 's/^ ✗ //p' +) +if [ "${#scenarios[@]}" -eq 0 ]; then + exit "$rc" +fi +for scenario in "${scenarios[@]}"; do + if ! [[ "$scenario" =~ ^[A-Za-z0-9/_-]+$ ]]; then + echo "Extracted unexpected-failure name '${scenario}' does not look like a scenario name; passing the suite failure through." >&2 + exit "$rc" + fi +done + +# A stale baseline entry is a configuration error a solo rerun cannot excuse. +if printf '%s\n' "$plain" | grep -q '^Stale baseline entries'; then + echo "Suite also reported stale baseline entries; not retrying." >&2 + exit "$rc" +fi + +# Reuse the suite invocation's arguments for the solo runs, minus the flags +# that only make sense for a suite (--scenario replaces --suite; single runs +# are judged directly, not against the baseline). Solo results are saved next +# to the suite's so the uploaded artifact carries both. +rerun_args=() +output_dir="" +skip_next=0 +expect_output_dir=0 +for arg in "$@"; do + if [ "$skip_next" -eq 1 ]; then + if [ "$expect_output_dir" -eq 1 ]; then + output_dir="$arg" + fi + skip_next=0 + expect_output_dir=0 + continue + fi + case "$arg" in + --output-dir) + skip_next=1 + expect_output_dir=1 + ;; + --suite | --expected-failures) skip_next=1 ;; + --output-dir=*) output_dir="${arg#--output-dir=}" ;; + --suite=* | --expected-failures=*) ;; + *) rerun_args+=("$arg") ;; + esac +done +if [ -n "$output_dir" ]; then + rerun_args+=(--output-dir "${output_dir}-solo") +fi + +for scenario in "${scenarios[@]}"; do + passed=0 + for attempt in $(seq 1 "$SOLO_ATTEMPTS"); do + echo "" + echo "Re-running '${scenario}' solo (attempt ${attempt}/${SOLO_ATTEMPTS})..." + if npx --yes "$CONFORMANCE_PKG" client --scenario "$scenario" "${rerun_args[@]}"; then + passed=1 + break + fi + done + if [ "$passed" -ne 1 ]; then + echo "'${scenario}' still fails when run alone: real failure, not suite contention." >&2 + exit 1 + fi +done + +if [ -n "$output_dir" ]; then + mkdir -p "$output_dir" + printf '%s\n' "${scenarios[@]}" > "$output_dir/FLAKE_RESCUED" +fi +echo "All ${#scenarios[@]} unexpected failure(s) passed when re-run solo; the suite failures were parallel-run contention." +exit 0 diff --git a/.github/workflows/conformance.yml b/.github/workflows/conformance.yml index 35f8b6dcc..c80da0813 100644 --- a/.github/workflows/conformance.yml +++ b/.github/workflows/conformance.yml @@ -64,17 +64,20 @@ jobs: ./.github/actions/conformance/run-server.sh --suite active --expected-failures ./.github/actions/conformance/expected-failures.yml + --output-dir conformance-results/server-active - name: Run server conformance (draft suite) run: >- ./.github/actions/conformance/run-server.sh --suite draft --expected-failures ./.github/actions/conformance/expected-failures.yml + --output-dir conformance-results/server-draft - name: Run server conformance (2026-07-28 wire, all suite) run: >- ./.github/actions/conformance/run-server.sh --suite all --spec-version 2026-07-28 --expected-failures ./.github/actions/conformance/expected-failures.2026-07-28.yml + --output-dir conformance-results/server-2026-07-28 - name: Run server conformance (all suite, extension scenarios) # A bare `--suite all` (no --spec-version) selects every scenario # shipped with the pinned harness — including the extension-tagged @@ -91,6 +94,16 @@ jobs: ./.github/actions/conformance/run-server.sh --suite all --expected-failures ./.github/actions/conformance/expected-failures.yml + --output-dir conformance-results/server-all + - name: Upload conformance results + # The suite summary only prints counts for warning-level findings; the + # per-check measurements live in the checks.json files saved above. + if: failure() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: server-conformance-results + path: conformance-results/ + if-no-files-found: ignore client-conformance: runs-on: ubuntu-latest @@ -118,22 +131,46 @@ jobs: echo "CONFORMANCE_PKG=file:/tmp/conformance.tgz" >> "$GITHUB_ENV" ;; esac - - run: uv sync --frozen --all-extras --package mcp + # --compile-bytecode: the harness spawns every scenario's client + # concurrently; without pre-compiled site-packages, ~40 fresh + # interpreters race to byte-compile the same modules on a 2-core + # runner, saturating it for the first ~20s of the suite — exactly when + # timing-sensitive scenarios take their measurements. + - run: uv sync --frozen --all-extras --package mcp --compile-bytecode + - name: Pre-compile bytecode (editable sources) + run: uv run --frozen python -m compileall -q src .github/actions/conformance - name: Run client conformance (all suite) # The harness runs all scenarios via unbounded Promise.all; with 40 # scenarios on a 2-core runner the slowest one (sse-retry, which has a # real-time SSE reconnect wait) needs more than the 30s default budget. + # The client command execs the synced venv's interpreter directly: + # `uv run` would re-check the lockfile in every one of the ~40 + # concurrent spawns, compounding the startup storm. run-client.sh + # re-verifies unexpected failures solo before failing the job. run: >- - npx --yes "$CONFORMANCE_PKG" client - --command 'uv run --frozen python .github/actions/conformance/client.py' + ./.github/actions/conformance/run-client.sh + --command '.venv/bin/python .github/actions/conformance/client.py' --suite all --timeout 60000 --expected-failures ./.github/actions/conformance/expected-failures.yml + --output-dir conformance-results/client-all - name: Run client conformance (2026-07-28 wire, all suite) run: >- - npx --yes "$CONFORMANCE_PKG" client - --command 'uv run --frozen python .github/actions/conformance/client.py' + ./.github/actions/conformance/run-client.sh + --command '.venv/bin/python .github/actions/conformance/client.py' --suite all --timeout 60000 --spec-version 2026-07-28 --expected-failures ./.github/actions/conformance/expected-failures.2026-07-28.yml + --output-dir conformance-results/client-2026-07-28 + - name: Upload conformance results + # The suite summary only prints counts for warning-level findings; the + # per-check measurements live in the checks.json files saved above. + # Also upload when run-client.sh rescued a flake (job green, but the + # contention evidence should not be discarded). + if: failure() || hashFiles('conformance-results/**/FLAKE_RESCUED') != '' + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: client-conformance-results + path: conformance-results/ + if-no-files-found: ignore diff --git a/examples/servers/everything-server/mcp_everything_server/server.py b/examples/servers/everything-server/mcp_everything_server/server.py index 8621c877a..cb2a84b3e 100644 --- a/examples/servers/everything-server/mcp_everything_server/server.py +++ b/examples/servers/everything-server/mcp_everything_server/server.py @@ -191,10 +191,14 @@ async def test_tool_with_progress(ctx: Context) -> str: async def test_sampling(prompt: str, ctx: Context) -> str: """Tests server-initiated sampling (LLM completion request)""" try: - # Request sampling from client + # Request sampling from client. related_request_id routes the request onto + # the originating tools/call SSE stream, which exists for the whole handler; + # without it the request targets the standalone GET stream and is dropped if + # the client has not finished opening that stream yet. result = await ctx.session.create_message( # pyright: ignore[reportDeprecated] messages=[SamplingMessage(role="user", content=TextContent(type="text", text=prompt))], max_tokens=100, + related_request_id=ctx.request_id, ) # Since we're not passing tools param, result.content is single content From 4eeca68a6cc85d806b690ea96a9637f5962a3cbc Mon Sep 17 00:00:00 2001 From: Max Isbey <224885523+maxisbey@users.noreply.github.com> Date: Wed, 1 Jul 2026 14:30:22 +0000 Subject: [PATCH 2/4] Trim comments to the load-bearing lines --- .github/actions/conformance/run-client.sh | 30 +++++-------------- .github/workflows/conformance.yml | 22 +++++--------- .../mcp_everything_server/server.py | 6 ++-- log.txt | 12 ++++++++ 4 files changed, 29 insertions(+), 41 deletions(-) create mode 100644 log.txt diff --git a/.github/actions/conformance/run-client.sh b/.github/actions/conformance/run-client.sh index 0ef1b0f37..6cc2f58db 100755 --- a/.github/actions/conformance/run-client.sh +++ b/.github/actions/conformance/run-client.sh @@ -1,22 +1,14 @@ #!/bin/bash # Run a client conformance suite, re-verifying unexpected failures solo. -# -# Suite mode launches every scenario's client subprocess concurrently; on a -# 2-vCPU runner that contention can push scenarios with real-time waits (the -# SSE reconnect timing in sse-retry) past their tolerances. So a scenario the -# suite run flags as an unexpected failure is re-run alone on the then-quiet -# runner: a real failure fails again and the job stays red; a contention -# artifact passes and the job goes green, with a FLAKE_RESCUED marker written -# into the --output-dir so the artifact upload preserves the evidence. -# Failures that only reproduce under concurrency are deliberately traded -# away - the suite asserts spec compliance, not behavior under parallel load. +# Concurrent suite runs on a 2-vCPU runner can push scenarios with real-time +# waits past tolerance; solo, a real failure fails again while a contention +# artifact passes. Failures that only reproduce under concurrency are excused. set -uo pipefail : "${CONFORMANCE_PKG:?set CONFORMANCE_PKG (pinned in .github/workflows/conformance.yml)}" SOLO_ATTEMPTS="${CONFORMANCE_SOLO_ATTEMPTS:-2}" -# Relative paths in the arguments (the client command, --output-dir) resolve -# from the repo root, same contract as run-server.sh. +# Relative args resolve from the repo root; same contract as run-server.sh. SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "$SCRIPT_DIR/../../.." || exit 1 @@ -31,12 +23,8 @@ fi plain="$(sed 's/\x1b\[[0-9;]*m//g' "$log")" -# Scenarios listed under "Unexpected failures (not in baseline):". Anything -# else behind the nonzero exit (stale baseline entries, harness or infra -# errors) is not retried. The extraction is coupled to the pinned harness's -# summary wording and print order; if a pin bump changes either, the list -# comes up empty and the original failure passes through - never a false -# green. +# If the harness's summary wording changes, the list comes up empty and the +# original exit code passes through - never a false green. mapfile -t scenarios < <( printf '%s\n' "$plain" | sed -n '/^Unexpected failures (not in baseline):$/,/^$/p' | @@ -58,10 +46,8 @@ if printf '%s\n' "$plain" | grep -q '^Stale baseline entries'; then exit "$rc" fi -# Reuse the suite invocation's arguments for the solo runs, minus the flags -# that only make sense for a suite (--scenario replaces --suite; single runs -# are judged directly, not against the baseline). Solo results are saved next -# to the suite's so the uploaded artifact carries both. +# Drop the suite-only flags: --scenario replaces --suite, and solo runs are +# judged directly rather than against the baseline. rerun_args=() output_dir="" skip_next=0 diff --git a/.github/workflows/conformance.yml b/.github/workflows/conformance.yml index c80da0813..dd132698d 100644 --- a/.github/workflows/conformance.yml +++ b/.github/workflows/conformance.yml @@ -96,8 +96,7 @@ jobs: --expected-failures ./.github/actions/conformance/expected-failures.yml --output-dir conformance-results/server-all - name: Upload conformance results - # The suite summary only prints counts for warning-level findings; the - # per-check measurements live in the checks.json files saved above. + # The log has only summary counts; per-check data is in checks.json. if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: @@ -131,11 +130,8 @@ jobs: echo "CONFORMANCE_PKG=file:/tmp/conformance.tgz" >> "$GITHUB_ENV" ;; esac - # --compile-bytecode: the harness spawns every scenario's client - # concurrently; without pre-compiled site-packages, ~40 fresh - # interpreters race to byte-compile the same modules on a 2-core - # runner, saturating it for the first ~20s of the suite — exactly when - # timing-sensitive scenarios take their measurements. + # --compile-bytecode: without it, ~40 concurrently spawned interpreters + # race to byte-compile site-packages during the timing-sensitive window. - run: uv sync --frozen --all-extras --package mcp --compile-bytecode - name: Pre-compile bytecode (editable sources) run: uv run --frozen python -m compileall -q src .github/actions/conformance @@ -143,10 +139,8 @@ jobs: # The harness runs all scenarios via unbounded Promise.all; with 40 # scenarios on a 2-core runner the slowest one (sse-retry, which has a # real-time SSE reconnect wait) needs more than the 30s default budget. - # The client command execs the synced venv's interpreter directly: - # `uv run` would re-check the lockfile in every one of the ~40 - # concurrent spawns, compounding the startup storm. run-client.sh - # re-verifies unexpected failures solo before failing the job. + # `.venv/bin/python` (not `uv run`) avoids lockfile re-checks in ~40 + # concurrent spawns; run-client.sh re-runs unexpected failures solo. run: >- ./.github/actions/conformance/run-client.sh --command '.venv/bin/python .github/actions/conformance/client.py' @@ -164,10 +158,8 @@ jobs: --expected-failures ./.github/actions/conformance/expected-failures.2026-07-28.yml --output-dir conformance-results/client-2026-07-28 - name: Upload conformance results - # The suite summary only prints counts for warning-level findings; the - # per-check measurements live in the checks.json files saved above. - # Also upload when run-client.sh rescued a flake (job green, but the - # contention evidence should not be discarded). + # The log has only summary counts; per-check data is in checks.json. + # Also on FLAKE_RESCUED: rescued-flake evidence is otherwise discarded. if: failure() || hashFiles('conformance-results/**/FLAKE_RESCUED') != '' uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: diff --git a/examples/servers/everything-server/mcp_everything_server/server.py b/examples/servers/everything-server/mcp_everything_server/server.py index cb2a84b3e..a14fa487d 100644 --- a/examples/servers/everything-server/mcp_everything_server/server.py +++ b/examples/servers/everything-server/mcp_everything_server/server.py @@ -191,10 +191,8 @@ async def test_tool_with_progress(ctx: Context) -> str: async def test_sampling(prompt: str, ctx: Context) -> str: """Tests server-initiated sampling (LLM completion request)""" try: - # Request sampling from client. related_request_id routes the request onto - # the originating tools/call SSE stream, which exists for the whole handler; - # without it the request targets the standalone GET stream and is dropped if - # the client has not finished opening that stream yet. + # Request sampling from client. Without related_request_id the request goes + # to the standalone GET stream and is silently dropped if it is not open yet. result = await ctx.session.create_message( # pyright: ignore[reportDeprecated] messages=[SamplingMessage(role="user", content=TextContent(type="text", text=prompt))], max_tokens=100, diff --git a/log.txt b/log.txt new file mode 100644 index 000000000..45b17f748 --- /dev/null +++ b/log.txt @@ -0,0 +1,12 @@ +Total: 1 passed, 2 failed, 0 warnings + +Expected failures (in baseline): + ~ foo + +Stale baseline entries (now passing - remove from baseline): + ✓ bar + +Unexpected failures (not in baseline): + ✗ sse-retry + +Baseline is stale: update your expected-failures file to remove passing scenarios. From b4558ce811b1e92ecce02cc007c41b780ada3e4c Mon Sep 17 00:00:00 2001 From: Max Isbey <224885523+maxisbey@users.noreply.github.com> Date: Wed, 1 Jul 2026 14:53:45 +0000 Subject: [PATCH 3/4] Address review: drop stray fixture, gitignore results, single solo attempt A solo failure on the quiet runner already disproves the contention hypothesis, so a second attempt was the blind retry the wrapper exists to avoid; CONFORMANCE_SOLO_ATTEMPTS remains the override. --- .github/actions/conformance/run-client.sh | 4 +++- .gitignore | 3 +++ log.txt | 12 ------------ 3 files changed, 6 insertions(+), 13 deletions(-) delete mode 100644 log.txt diff --git a/.github/actions/conformance/run-client.sh b/.github/actions/conformance/run-client.sh index 6cc2f58db..3092dbf86 100755 --- a/.github/actions/conformance/run-client.sh +++ b/.github/actions/conformance/run-client.sh @@ -6,7 +6,9 @@ set -uo pipefail : "${CONFORMANCE_PKG:?set CONFORMANCE_PKG (pinned in .github/workflows/conformance.yml)}" -SOLO_ATTEMPTS="${CONFORMANCE_SOLO_ATTEMPTS:-2}" +# One attempt: a solo failure on the quiet runner disproves the contention +# hypothesis; a second try would be the blind retry this script avoids. +SOLO_ATTEMPTS="${CONFORMANCE_SOLO_ATTEMPTS:-1}" # Relative args resolve from the repo root; same contract as run-server.sh. SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" diff --git a/.gitignore b/.gitignore index 3443adf7c..684f8d7b0 100644 --- a/.gitignore +++ b/.gitignore @@ -173,3 +173,6 @@ cython_debug/ # claude code results/ + +# conformance CI local runs +conformance-results/ diff --git a/log.txt b/log.txt deleted file mode 100644 index 45b17f748..000000000 --- a/log.txt +++ /dev/null @@ -1,12 +0,0 @@ -Total: 1 passed, 2 failed, 0 warnings - -Expected failures (in baseline): - ~ foo - -Stale baseline entries (now passing - remove from baseline): - ✓ bar - -Unexpected failures (not in baseline): - ✗ sse-retry - -Baseline is stale: update your expected-failures file to remove passing scenarios. From a2072c30d48fd95c2d3cfc9b1c67c4db5c1b3ae3 Mon Sep 17 00:00:00 2001 From: Max Isbey <224885523+maxisbey@users.noreply.github.com> Date: Wed, 1 Jul 2026 15:55:12 +0000 Subject: [PATCH 4/4] Keep the stale-baseline guard fail-closed under pipefail grep -q quitting at the first match SIGPIPEs the printf feeding it; with pipefail the pipeline reports 141 and the guard is skipped exactly when the pattern is present. A here-string has no writer to kill. --- .github/actions/conformance/run-client.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/actions/conformance/run-client.sh b/.github/actions/conformance/run-client.sh index 3092dbf86..3c9678877 100755 --- a/.github/actions/conformance/run-client.sh +++ b/.github/actions/conformance/run-client.sh @@ -43,7 +43,9 @@ for scenario in "${scenarios[@]}"; do done # A stale baseline entry is a configuration error a solo rerun cannot excuse. -if printf '%s\n' "$plain" | grep -q '^Stale baseline entries'; then +# Here-string, not a pipe: grep -q quitting early would SIGPIPE printf and, +# under pipefail, skip this guard exactly when the pattern is present. +if grep -q '^Stale baseline entries' <<<"$plain"; then echo "Suite also reported stale baseline entries; not retrying." >&2 exit "$rc" fi