diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
index c78912f1..3f26b584 100644
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -2,3 +2,8 @@ self-hosted-runner:
   labels:
     - nvidia-runner-1
     - macos-runner-1
+    # GitHub-hosted GPU larger runner (gpu-eda team plan), added 2026-06-05.
+    # A custom-named larger runner, so actionlint needs it listed.
+    # (macos-latest-xlarge is a standard GitHub label — actionlint already
+    # knows it, so it doesn't go here.)
+    - tesla4-runner
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 00f177a5..36792fb9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,13 +3,20 @@ name: CI
 on:
   push:
     branches: [main, staged-aig-release]
+  # No base-branch filter: run CI on every PR regardless of its base, so
+  # stacked PRs (based on other feature branches) get CI automatically.
   pull_request:
-    branches: [main, staged-aig-release]
   workflow_dispatch:
 
 permissions:
   contents: write
 
+# Cancel superseded in-progress runs for the same ref (PR branch or main) so
+# rapid pushes don't pile up on the self-hosted / billed GPU + macOS runners.
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 env:
   CARGO_TERM_COLOR: always
 
@@ -144,9 +151,18 @@ jobs:
   # Build and run Metal simulation on macOS
   metal:
     name: Metal Tests (macOS)
-    # Re-enabled 2026-05-12 against the self-hosted chipflow org runner
-    # macos-runner-1 (Apple Silicon + Metal GPU).
-    runs-on: macos-runner-1
+    # Routine PR pushes run on the free self-hosted macos-runner-1
+    # (Apple Silicon + Metal GPU). On `main` — or any PR carrying the
+    # `ci:metal-xl` label — this light job offloads to the GitHub-hosted
+    # macos-latest-xlarge (M2 Pro) so it runs in parallel with the
+    # disk-heavy MCU SoC Metal job (which stays pinned to macos-runner-1,
+    # since xlarge has only 14 GB storage). This is the billed runner, so
+    # it's gated to main/label rather than every push.
+    # NOTE: the label is read from the triggering event's payload, and
+    # `pull_request` here uses the default activity types (no `labeled`),
+    # so a newly added label only takes effect on the PR's next push or
+    # reopen.
+    runs-on: ${{ (github.ref == 'refs/heads/main' || contains(github.event.pull_request.labels.*.name, 'ci:metal-xl')) && 'macos-latest-xlarge' || 'macos-runner-1' }}
     steps:
       - uses: actions/checkout@v6
         with:
@@ -453,10 +469,11 @@ jobs:
   # Build and run CUDA simulation on NVIDIA GPU
   cuda:
     name: CUDA Tests
-    # TODO(infra): re-enable once self-hosted nvidia-runner-1 is back online.
-    # Disabled 2026-05-01: GPU runner offline; jobs queue indefinitely.
-    if: ${{ false }}
-    runs-on: nvidia-runner-1
+    # Re-enabled 2026-06-05 on the GitHub-hosted T4 GPU runner
+    # (`tesla4-runner`, 4 vCPU + 1 NVIDIA T4). Runs on every push — CUDA
+    # had no CI coverage from 2026-05-01 (when self-hosted nvidia-runner-1
+    # went offline) until this runner landed.
+    runs-on: tesla4-runner
     steps:
       - uses: actions/checkout@v6
         with:
@@ -544,10 +561,11 @@ jobs:
   # When an AMD GPU runner becomes available, a native AMD job can be added.
   hip-on-nvidia:
     name: HIP Tests (NVIDIA backend)
-    # TODO(infra): re-enable once self-hosted nvidia-runner-1 is back online.
-    # Disabled 2026-05-01: GPU runner offline; jobs queue indefinitely.
-    if: ${{ false }}
-    runs-on: nvidia-runner-1
+    # Re-enabled 2026-06-05 on the GitHub-hosted T4 runner (`tesla4-runner`).
+    # This job builds the HIP code path with hipcc + HIP_PLATFORM=nvidia, so
+    # it validates the HIP backend on the same T4. A native AMD GPU job is
+    # still future work (needs an AMD/ROCm runner).
+    runs-on: tesla4-runner
     steps:
       - uses: actions/checkout@v6
         with:
@@ -1026,7 +1044,10 @@ jobs:
   # See tests/jtag_minimal/README.md for full design + regen flow.
   jtag-minimal-cosim:
     name: JTAG Minimal Cosim (Hazard3 DTM+DM)
-    runs-on: macos-runner-1
+    # Light Metal cosim: free self-hosted macos-runner-1 on PR pushes;
+    # offloads to the billed macos-latest-xlarge on main / `ci:metal-xl`
+    # label (see the `metal` job for the rationale).
+    runs-on: ${{ (github.ref == 'refs/heads/main' || contains(github.event.pull_request.labels.*.name, 'ci:metal-xl')) && 'macos-latest-xlarge' || 'macos-runner-1' }}
     timeout-minutes: 20
     # Regression gate for #84 (model-driven clock edge flags).
     continue-on-error: false