From 9d040ab3e94c6d07803a2a7fc5783a8ba7ca3b9d Mon Sep 17 00:00:00 2001
From: Rob Taylor <rob.taylor@chipflow.io>
Date: Fri, 5 Jun 2026 02:25:21 +0100
Subject: [PATCH 1/2] =?UTF-8?q?ci:=20use=20T4=20+=20xlarge=20runners=20?=
 =?UTF-8?q?=E2=80=94=20re-enable=20CUDA/HIP,=20de-serialize=20Metal?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gpu-eda team plan adds two GitHub-hosted larger runners: tesla4-runner
(4 vCPU + 1 NVIDIA T4) and macos-latest-xlarge (M2 Pro, 14 GB). Put them to
work alongside the free self-hosted macos-runner-1.

- CUDA Tests + HIP Tests (NVIDIA backend): un-gate (`if: false` removed) and
  move from the offline nvidia-runner-1 to tesla4-runner, running on every
  push. CUDA has had no CI coverage since 2026-05-01; the T4 runs CUDA
  natively and also exercises the HIP-on-NVIDIA codepath
  (HIP_PLATFORM=nvidia). Native AMD/HIP still needs an AMD runner.
- Metal Tests + JTAG Minimal Cosim: conditional runs-on — free self-hosted
  macos-runner-1 on routine PR pushes (full coverage, no cost), offloading to
  the billed macos-latest-xlarge on `main` or a `ci:metal-xl`-labelled PR so
  they run in parallel with the disk-heavy MCU SoC Metal job. That job stays
  pinned to macos-runner-1 (xlarge has only 14 GB storage).
- Add a workflow-level concurrency group (cancel-in-progress per ref) so
  rapid pushes don't pile up on the self-hosted / billed runners.
- Register tesla4-runner in .github/actionlint.yaml (macos-latest-xlarge is
  a standard GitHub label that actionlint already knows, so it is not listed).

Co-developed-by: Claude Code v2.1.162 (claude-opus-4-8)
---
 .github/actionlint.yaml  |  5 +++++
 .github/workflows/ci.yml | 40 ++++++++++++++++++++++++++++------------
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
index c78912f1..3f26b584 100644
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -2,3 +2,8 @@ self-hosted-runner:
   labels:
     - nvidia-runner-1
     - macos-runner-1
+    # GitHub-hosted GPU larger runner (gpu-eda team plan), added 2026-06-05.
+    # A custom-named larger runner, so actionlint needs it listed.
+    # (macos-latest-xlarge is a standard GitHub label — actionlint already
+    # knows it, so it doesn't go here.)
+    - tesla4-runner
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 00f177a5..03a4cc2c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,6 +10,12 @@ on:
 permissions:
   contents: write
 
+# One concurrency group per workflow + ref (PR branch or main): a newer run
+# cancels the one in progress, sparing the self-hosted / billed runners.
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 env:
   CARGO_TERM_COLOR: always
 
@@ -144,9 +150,14 @@ jobs:
   # Build and run Metal simulation on macOS
   metal:
     name: Metal Tests (macOS)
-    # Re-enabled 2026-05-12 against the self-hosted chipflow org runner
-    # macos-runner-1 (Apple Silicon + Metal GPU).
-    runs-on: macos-runner-1
+    # Routine PR pushes run on the free self-hosted macos-runner-1
+    # (Apple Silicon + Metal GPU). On `main` — or any PR carrying the
+    # `ci:metal-xl` label — this light job offloads to the GitHub-hosted
+    # macos-latest-xlarge (M2 Pro) so it runs in parallel with the
+    # disk-heavy MCU SoC Metal job (which stays pinned to macos-runner-1,
+    # since xlarge has only 14 GB storage). This is the billed runner, so
+    # it's gated to main/label rather than every push.
+    runs-on: ${{ (github.ref == 'refs/heads/main' || contains(github.event.pull_request.labels.*.name, 'ci:metal-xl')) && 'macos-latest-xlarge' || 'macos-runner-1' }}
     steps:
       - uses: actions/checkout@v6
         with:
@@ -453,10 +464,11 @@ jobs:
   # Build and run CUDA simulation on NVIDIA GPU
   cuda:
     name: CUDA Tests
-    # TODO(infra): re-enable once self-hosted nvidia-runner-1 is back online.
-    # Disabled 2026-05-01: GPU runner offline; jobs queue indefinitely.
-    if: ${{ false }}
-    runs-on: nvidia-runner-1
+    # Re-enabled 2026-06-05 on the GitHub-hosted T4 GPU runner
+    # (`tesla4-runner`, 4 vCPU + 1 NVIDIA T4). Runs on every push — CUDA
+    # had no CI coverage from 2026-05-01 (when self-hosted nvidia-runner-1
+    # went offline) until this runner landed.
+    runs-on: tesla4-runner
     steps:
       - uses: actions/checkout@v6
         with:
@@ -544,10 +556,11 @@ jobs:
   # When an AMD GPU runner becomes available, a native AMD job can be added.
   hip-on-nvidia:
     name: HIP Tests (NVIDIA backend)
-    # TODO(infra): re-enable once self-hosted nvidia-runner-1 is back online.
-    # Disabled 2026-05-01: GPU runner offline; jobs queue indefinitely.
-    if: ${{ false }}
-    runs-on: nvidia-runner-1
+    # Re-enabled 2026-06-05 on the GitHub-hosted T4 runner (`tesla4-runner`).
+    # This job builds the HIP code path with hipcc + HIP_PLATFORM=nvidia, so
+    # it validates the HIP backend on the same T4. A native AMD GPU job is
+    # still future work (needs an AMD/ROCm runner).
+    runs-on: tesla4-runner
     steps:
       - uses: actions/checkout@v6
         with:
@@ -1026,7 +1039,10 @@ jobs:
   # See tests/jtag_minimal/README.md for full design + regen flow.
   jtag-minimal-cosim:
     name: JTAG Minimal Cosim (Hazard3 DTM+DM)
-    runs-on: macos-runner-1
+    # Light Metal cosim: free self-hosted macos-runner-1 on PR pushes;
+    # offloads to the billed macos-latest-xlarge on main / `ci:metal-xl`
+    # label (see the `metal` job for the rationale).
+    runs-on: ${{ (github.ref == 'refs/heads/main' || contains(github.event.pull_request.labels.*.name, 'ci:metal-xl')) && 'macos-latest-xlarge' || 'macos-runner-1' }}
     timeout-minutes: 20
     # Regression gate for #84 (model-driven clock edge flags).
     continue-on-error: false

From d06a8ec97a4560efd6f8eacc0a9a6ea1f3ae5c34 Mon Sep 17 00:00:00 2001
From: Rob Taylor <rob.taylor@chipflow.io>
Date: Fri, 5 Jun 2026 02:48:57 +0100
Subject: [PATCH 2/2] ci: run on all pull_request bases (enable stacked-PR CI)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the `branches: [main, staged-aig-release]` filter on the pull_request
trigger. It filtered by *base* branch, so PRs stacked on other feature
branches got no CI until they cascaded down to a main base. Plain
`pull_request:` runs CI on every PR regardless of base. The push trigger
keeps its branch filter (we only want push-CI on main/staged-aig-release,
not on every feature-branch push — PRs cover those).

Co-developed-by: Claude Code v2.1.162 (claude-opus-4-8)
---
 .github/workflows/ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 03a4cc2c..36792fb9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,8 +3,9 @@ name: CI
 on:
   push:
     branches: [main, staged-aig-release]
+  # No base-branch filter: run CI on every PR regardless of its base, so
+  # stacked PRs (based on other feature branches) get CI automatically.
   pull_request:
-    branches: [main, staged-aig-release]
   workflow_dispatch:
 
 permissions: