From 9d040ab3e94c6d07803a2a7fc5783a8ba7ca3b9d Mon Sep 17 00:00:00 2001 From: Rob Taylor Date: Fri, 5 Jun 2026 02:25:21 +0100 Subject: [PATCH 1/2] =?UTF-8?q?ci:=20use=20T4=20+=20xlarge=20runners=20?= =?UTF-8?q?=E2=80=94=20re-enable=20CUDA/HIP,=20de-serialize=20Metal?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gpu-eda team plan adds two GitHub-hosted larger runners: tesla4-runner (4 vCPU + 1 NVIDIA T4) and macos-latest-xlarge (M2 Pro, 14 GB). Put them to work alongside the free self-hosted macos-runner-1. - CUDA Tests + HIP Tests (NVIDIA backend): un-gate (`if: false` removed) and move from the offline nvidia-runner-1 to tesla4-runner, every push. CUDA has had no CI coverage since 2026-05-01; the T4 runs CUDA natively and the HIP-on-NVIDIA codepath (HIP_PLATFORM=nvidia). Native AMD/HIP still needs an AMD runner. - Metal Tests + JTAG Minimal Cosim: conditional runs-on — free self-hosted macos-runner-1 on routine PR pushes (full coverage, no cost), offloading to the billed macos-latest-xlarge on `main` or a `ci:metal-xl`-labelled PR so they run in parallel with the disk-heavy MCU SoC Metal job. That job stays pinned to macos-runner-1 (xlarge has only 14 GB storage). - Add a workflow-level concurrency group (cancel-in-progress per ref) so rapid pushes don't pile up on the self-hosted / billed runners. - Register tesla4-runner in .github/actionlint.yaml (macos-latest-xlarge is a standard GitHub label that actionlint already knows, so it is not listed). Co-developed-by: Claude Code v2.1.162 (claude-opus-4-8) --- .github/actionlint.yaml | 5 +++++ .github/workflows/ci.yml | 40 ++++++++++++++++++++++++++++------------ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index c78912f1..3f26b584 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -2,3 +2,8 @@ self-hosted-runner: labels: - nvidia-runner-1 - macos-runner-1 + # GitHub-hosted GPU larger runner (gpu-eda team plan), added 2026-06-05. 
+ # A custom-named larger runner, so actionlint needs it listed. + # (macos-latest-xlarge is a standard GitHub label — actionlint already + # knows it, so it doesn't go here.) + - tesla4-runner diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 00f177a5..03a4cc2c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,6 +10,12 @@ on: permissions: contents: write +# Cancel superseded in-progress runs for the same ref (PR branch or main) so +# rapid pushes don't pile up on the self-hosted / billed GPU + macOS runners. +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: CARGO_TERM_COLOR: always @@ -144,9 +150,14 @@ jobs: # Build and run Metal simulation on macOS metal: name: Metal Tests (macOS) - # Re-enabled 2026-05-12 against the self-hosted chipflow org runner - # macos-runner-1 (Apple Silicon + Metal GPU). - runs-on: macos-runner-1 + # Routine PR pushes run on the free self-hosted macos-runner-1 + # (Apple Silicon + Metal GPU). On `main` — or any PR carrying the + # `ci:metal-xl` label — this light job offloads to the GitHub-hosted + # macos-latest-xlarge (M2 Pro) so it runs in parallel with the + # disk-heavy MCU SoC Metal job (which stays pinned to macos-runner-1, + # since xlarge has only 14 GB storage). This is the billed runner, so + # it's gated to main/label rather than every push. + runs-on: ${{ (github.ref == 'refs/heads/main' || contains(github.event.pull_request.labels.*.name, 'ci:metal-xl')) && 'macos-latest-xlarge' || 'macos-runner-1' }} steps: - uses: actions/checkout@v6 with: @@ -453,10 +464,11 @@ jobs: # Build and run CUDA simulation on NVIDIA GPU cuda: name: CUDA Tests - # TODO(infra): re-enable once self-hosted nvidia-runner-1 is back online. - # Disabled 2026-05-01: GPU runner offline; jobs queue indefinitely. 
- if: ${{ false }} - runs-on: nvidia-runner-1 + # Re-enabled 2026-06-05 on the GitHub-hosted T4 GPU runner + # (`tesla4-runner`, 4 vCPU + 1 NVIDIA T4). Runs on every push — CUDA + # had no CI coverage from 2026-05-01 (when self-hosted nvidia-runner-1 + # went offline) until this runner landed. + runs-on: tesla4-runner steps: - uses: actions/checkout@v6 with: @@ -544,10 +556,11 @@ jobs: # When an AMD GPU runner becomes available, a native AMD job can be added. hip-on-nvidia: name: HIP Tests (NVIDIA backend) - # TODO(infra): re-enable once self-hosted nvidia-runner-1 is back online. - # Disabled 2026-05-01: GPU runner offline; jobs queue indefinitely. - if: ${{ false }} - runs-on: nvidia-runner-1 + # Re-enabled 2026-06-05 on the GitHub-hosted T4 runner (`tesla4-runner`). + # This job builds the HIP code path with hipcc + HIP_PLATFORM=nvidia, so + # it validates the HIP backend on the same T4. A native AMD GPU job is + # still future work (needs an AMD/ROCm runner). + runs-on: tesla4-runner steps: - uses: actions/checkout@v6 with: @@ -1026,7 +1039,10 @@ jobs: # See tests/jtag_minimal/README.md for full design + regen flow. jtag-minimal-cosim: name: JTAG Minimal Cosim (Hazard3 DTM+DM) - runs-on: macos-runner-1 + # Light Metal cosim: free self-hosted macos-runner-1 on PR pushes; + # offloads to the billed macos-latest-xlarge on main / `ci:metal-xl` + # label (see the `metal` job for the rationale). + runs-on: ${{ (github.ref == 'refs/heads/main' || contains(github.event.pull_request.labels.*.name, 'ci:metal-xl')) && 'macos-latest-xlarge' || 'macos-runner-1' }} timeout-minutes: 20 # Regression gate for #84 (model-driven clock edge flags). 
continue-on-error: false From d06a8ec97a4560efd6f8eacc0a9a6ea1f3ae5c34 Mon Sep 17 00:00:00 2001 From: Rob Taylor Date: Fri, 5 Jun 2026 02:48:57 +0100 Subject: [PATCH 2/2] ci: run on all pull_request bases (enable stacked-PR CI) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop the `branches: [main, staged-aig-release]` filter on the pull_request trigger. It filtered by *base* branch, so PRs stacked on other feature branches got no CI until they cascaded down to a main base. Plain `pull_request:` runs CI on every PR regardless of base. The push trigger keeps its branch filter (we only want push-CI on main/staged-aig-release, not on every feature-branch push — PRs cover those). Co-developed-by: Claude Code v2.1.162 (claude-opus-4-8) --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 03a4cc2c..36792fb9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,8 +3,9 @@ name: CI on: push: branches: [main, staged-aig-release] + # No base-branch filter: run CI on every PR regardless of its base, so + # stacked PRs (based on other feature branches) get CI automatically. pull_request: - branches: [main, staged-aig-release] workflow_dispatch: permissions: