diff --git a/docs/install.md b/docs/install.md index cb0f4f88..3d7b881b 100644 --- a/docs/install.md +++ b/docs/install.md @@ -110,6 +110,8 @@ Visit [OpenVINO System Requirments](https://docs.openvino.ai/2025/about-openvino export OPENARC_API_KEY="openarc-api-key" # optional — pass --use-api-key to openarc serve start to enforce export OPENARC_AUTOLOAD_MODEL="model_name" # model_name to load on startup export MODEL_PATH="/path/to/your/models" # mount your models to `/models` inside the container + export OPENARC_LOG_FILE="/dev/null" # optional - use /dev/null to disable file-based logging, or set a specific file path to log to + export OPENARC_CONFIG_FILE="/config/file/path/config.json" # optional - use the config file at the specified path docker-compose up --build -d ``` diff --git a/docs/kubernetes.md b/docs/kubernetes.md new file mode 100644 index 00000000..45afffb1 --- /dev/null +++ b/docs/kubernetes.md @@ -0,0 +1,661 @@ +--- +icon: lucide/ship-wheel +--- + +OpenARC can be deployed to Kubernetes. Here are some examples: + +=== "Simple deployment" + + This is a relatively simple deployment for running Qwen3-1.7B on a CPU. The model, device, and other parameters can be changed by modifying the init container's environment variables.
+ + ```yaml + --- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: &app_name openarc + labels: &labels + app: *app_name + spec: + selector: + matchLabels: *labels + template: + metadata: + labels: *labels + spec: + initContainers: + - name: prepare-model + image: &image ghcr.io/searchsavior/openarc + env: + - name: MODEL_NAME + value: &model_name qwen3 + - name: MODEL_TYPE + value: llm + - name: ENGINE_NAME + value: ovgenai + - name: DEVICE + value: CPU + - name: HF_REPO_NAME + value: Echo9Zulu/Qwen3-1.7B-int8_asym-ov + - &config_file_var + name: OPENARC_CONFIG_FILE + value: /models/openarc_config.json + # Optional extra args, see https://searchsavior.github.io/OpenArc/commands/#add + # - name: EXTRA_ADD_ARGS + # value: "--runtime-config '{\"MODEL_DISTRIBUTION_POLICY\": \"PIPELINE_PARALLEL\"}'" + command: ["python", "-c"] + args: + - | + import shlex, os + from huggingface_hub import snapshot_download + + model_name = os.environ["MODEL_NAME"] + + # Download the model if it doesn't exist already + model_dir = f"/models/{model_name}" + if not os.path.exists(f"{model_dir}/openvino_model.xml"): + snapshot_download(repo_id=os.environ["HF_REPO_NAME"], local_dir=model_dir) + + # Register the model with OpenArc. The runtime will autoload it on startup. + add_command = [ + "openarc", "add", + # Must match the OPENARC_AUTOLOAD_MODEL env var in the openarc container. 
+ "--model-name", model_name, + "--model-path", model_dir, + "--engine", os.environ["ENGINE_NAME"], + "--model-type", os.environ["MODEL_TYPE"], + "--device", os.environ["DEVICE"] + ] + if "EXTRA_ADD_ARGS" in os.environ: + add_command.extend(shlex.split(os.environ["EXTRA_ADD_ARGS"])) + + os.execlp(add_command[0], *add_command) + volumeMounts: &mounts + - name: models + mountPath: /models + containers: + - name: openarc + image: *image + env: + - name: OPENARC_AUTOLOAD_MODEL + value: *model_name + - *config_file_var + ports: + - containerPort: 8000 + name: http + # TODO add probes once I fix the /status endpoint bug + volumeMounts: *mounts + volumes: + # Holds the model itself + - name: models + emptyDir: {} + --- + apiVersion: v1 + kind: Service + metadata: + name: &app_name openarc + labels: &labels + app: *app_name + spec: + selector: *labels + ports: + - name: http + port: 8000 + ``` + +=== "GPU access via the Intel GPU DRA plugin for Kubernetes" + + OpenARC works with the [Intel GPU DRA plugin for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes). This allows pods to only be scheduled on nodes + with Intel GPUs available, and allows choosing which GPUs to make available to OpenARC when nodes contain multiple GPUs. This service is not required for GPU access - you can + mount a hostpath to the device, or use the older [Intel GPU device plugin for Kubernetes (non-DRA)](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/gpu_plugin/README.md). + + This requires Kubernetes 1.34 or newer. 
+ + ```yaml + --- + apiVersion: resource.k8s.io/v1 + kind: ResourceClaimTemplate + metadata: + name: openarc-gpu + labels: + app: openarc + spec: + spec: + devices: + requests: + - name: gpu + exactly: + deviceClassName: gpu.intel.com + selectors: + - cel: + # Configure this for your device + expression: device.attributes["gpu.intel.com"].family == 'Arc Pro' + --- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: &app_name openarc + labels: &labels + app: *app_name + spec: + selector: + matchLabels: *labels + template: + metadata: + labels: *labels + spec: + initContainers: + - name: prepare-model + image: &image ghcr.io/searchsavior/openarc + env: + - name: MODEL_NAME + value: &model_name qwen3 + - name: MODEL_TYPE + value: llm + - name: ENGINE_NAME + value: ovgenai + - name: DEVICE + value: GPU + - name: HF_REPO_NAME + value: Echo9Zulu/Qwen3-1.7B-int8_asym-ov + - &config_file_var + name: OPENARC_CONFIG_FILE + value: /models/openarc_config.json + # Optional extra args, see https://searchsavior.github.io/OpenArc/commands/#add + # - name: EXTRA_ADD_ARGS + # value: "--runtime-config '{\"MODEL_DISTRIBUTION_POLICY\": \"PIPELINE_PARALLEL\"}'" + command: ["python", "-c"] + args: + - | + import shlex, os + from huggingface_hub import snapshot_download + + model_name = os.environ["MODEL_NAME"] + + # Download the model if it doesn't exist already + model_dir = f"/models/{model_name}" + if not os.path.exists(f"{model_dir}/openvino_model.xml"): + snapshot_download(repo_id=os.environ["HF_REPO_NAME"], local_dir=model_dir) + + # Register the model with OpenArc. The runtime will autoload it on startup. + add_command = [ + "openarc", "add", + # Must match the OPENARC_AUTOLOAD_MODEL env var in the openarc container. 
+ "--model-name", model_name, + "--model-path", model_dir, + "--engine", os.environ["ENGINE_NAME"], + "--model-type", os.environ["MODEL_TYPE"], + "--device", os.environ["DEVICE"] + ] + if "EXTRA_ADD_ARGS" in os.environ: + add_command.extend(shlex.split(os.environ["EXTRA_ADD_ARGS"])) + + os.execlp(add_command[0], *add_command) + volumeMounts: &mounts + - name: models + mountPath: /models + containers: + - name: openarc + image: *image + env: + - name: OPENARC_AUTOLOAD_MODEL + value: *model_name + - *config_file_var + ports: + - containerPort: 8000 + name: http + # TODO add probes once I fix the /status endpoint bug + volumeMounts: *mounts + # Important: If building a model caching via a separate pod, the GPU must also be made available to the caching pod. + resources: + claims: + - name: gpu + volumes: + # Holds the model itself + - name: models + emptyDir: {} + resourceClaims: + - name: gpu + resourceClaimTemplateName: openarc-gpu + --- + apiVersion: v1 + kind: Service + metadata: + name: &app_name openarc + labels: &labels + app: *app_name + spec: + selector: *labels + ports: + - name: http + port: 8000 + ``` + +=== "Loading images via imageVolumes" + + Kubernetes 1.33 and above supports mounting OCI container images as volumes. This can be used to avoid download/setup initcontainers and jobs. Users are expected to build and publish + their own container images for this purpose. + + Note that it is not generally recommended to cache compiled models within model container images. Compiled models are dependent on device, kernel, kernel module, OpenArc, and model + versions. Changing any one of these invalidates the cache, resulting in models not being portable. If you choose to cache compiled models in a container image, make sure that they are + built against the exact dependency versions (including GPU) that will be used within the Kubernetes cluster. 
+ + Example Dockerfile for building a Qwen3-1.7B image: + + ```Dockerfile + # syntax=docker/dockerfile:1.7 + # Build a self-contained model image: just the downloaded HF model directory + # plus a generated openarc_config.json with a *relative* model_path so the + # image's contents can be mounted into the openarc runtime at any location. + + ARG OPENARC_IMAGE=ghcr.io/searchsavior/openarc + + FROM ${OPENARC_IMAGE} AS builder + + ARG MODEL_NAME=qwen3 + ARG MODEL_TYPE=llm + ARG ENGINE_NAME=ovgenai + ARG DEVICE=CPU + ARG HF_REPO_NAME=Echo9Zulu/Qwen3-1.7B-int8_asym-ov + + ENV MODEL_NAME=${MODEL_NAME} \ + MODEL_TYPE=${MODEL_TYPE} \ + ENGINE_NAME=${ENGINE_NAME} \ + DEVICE=${DEVICE} \ + HF_REPO_NAME=${HF_REPO_NAME} + + WORKDIR /out + + RUN <<'PY' python + import os, shutil, subprocess + from huggingface_hub import snapshot_download + + model_name = os.environ["MODEL_NAME"] + model_dir = f"/out/{model_name}" + + # Download the model into /out/, then drop HF's bookkeeping cache. + snapshot_download(repo_id=os.environ["HF_REPO_NAME"], local_dir=model_dir) + shutil.rmtree(f"{model_dir}/.cache", ignore_errors=True) + + # Generate the config alongside the model so model_path can be a relative + # basename — openarc resolves it against the config file's directory both + # at write-time validation and at runtime, making the image location-independent. + subprocess.run( + [ + "openarc", "add", + "--model-name", model_name, + "--model-path", model_name, + "--engine", os.environ["ENGINE_NAME"], + "--model-type", os.environ["MODEL_TYPE"], + "--device", os.environ["DEVICE"], + ], + check=True, + env={**os.environ, "OPENARC_CONFIG_FILE": "/out/openarc_config.json"}, + ) + PY + + # Final image: only the model directory and config file, nothing else. + FROM scratch + COPY --from=builder /out/ / + ``` + + Build and push the image: + + ```console + $ docker build -f openarc-model.Dockerfile -t ghcr.io//openarc-qwen3-model:latest . 
+ $ docker push ghcr.io//openarc-qwen3-model:latest + ``` + + Or to package a different model from HuggingFace, override the build args: + + ```console + $ docker build \ + -f openarc-model.Dockerfile \ + --build-arg MODEL_NAME=nanbeige \ + --build-arg HF_REPO_NAME=Echo9Zulu/Nanbeige4.1-3B-int4-awq-ov \ + -t ghcr.io//openarc-nanbeige-model:latest \ + . + $ docker push ghcr.io//openarc-nanbeige-model:latest + ``` + + Deployment manifest, now simplified: + + ```yaml + --- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: &app_name openarc + labels: &labels + app: *app_name + spec: + selector: + matchLabels: *labels + template: + metadata: + labels: *labels + spec: + containers: + - name: openarc + image: ghcr.io/searchsavior/openarc + env: + # Must match the model name baked into the model image's openarc_config.json. + - name: OPENARC_AUTOLOAD_MODEL + value: qwen3 + # Point the runtime at the config baked into the model image. + # The config's relative model_path resolves to /model//. + - name: OPENARC_CONFIG_FILE + value: /model/openarc_config.json + ports: + - containerPort: 8000 + name: http + # TODO add probes once I fix the /status endpoint bug + volumeMounts: + - name: model + mountPath: /model + volumes: + - name: model + image: + reference: ghcr.io//openarc-qwen3-model + --- + apiVersion: v1 + kind: Service + metadata: + name: &app_name openarc + labels: &labels + app: *app_name + spec: + selector: *labels + ports: + - name: http + port: 8000 + ``` + +=== "Fully featured, limited-access HA deployment with model caching" + + If your cluster has multiple _identical_ nodes, you can pre-compile models and cache the result. This allows for making the root filesystem readonly, speeds up model loading time, + reduces the peak memory requirements (compilation is expensive), and can be re-used across pod restarts on multiple nodes. 
+ + > [!IMPORTANT] + > Compiled models are dependent on a specific combination of device, kernel, GPU kernel module, libraries, OpenArc, and model versions. A model compiled for an Intel iGPU will not + > work on an Intel discrete GPU. Any change to any of these dependencies will invalidate the model cache. If you upgrade your node's host OS and deploy a new GPU kernel module (i915 + > driver, for example), the cache will need to be rebuilt. For details, see [here](https://docs.openvino.ai/2026/model-server/ovms_docs_model_cache.html). + + OpenArc is stateless. Multiple replicas can be deployed for load balancing or to make OpenArc highly available. + + Here is an example deployment implementing this. Note that the job will need to complete prior to scaling up the deployment, and will need to be re-run any time the model needs to be + recompiled. Helm annotations are added to show what you'd need to do if packaging this in a chart. This requires Kubernetes 1.35+ and an RWX storage class: + + ```yaml + --- + apiVersion: resource.k8s.io/v1 + kind: ResourceClaimTemplate + metadata: + name: openarc-gpu + labels: + app: openarc + spec: + spec: + devices: + requests: + - name: gpu + exactly: + deviceClassName: gpu.intel.com + selectors: + - cel: + # Configure this for your device + expression: device.attributes["gpu.intel.com"].family == 'Arc Pro' + --- + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: openarc-models + labels: + app: openarc + annotations: + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-10" + helm.sh/resource-policy: keep + spec: + # Must either be RWX, or the deployment must be scaled to 0 prior to running upgrade jobs + storageClassName: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + --- + # Cache-population Job. Downloads the HF model and runs OpenVINO graph + # optimization into /models so the runtime Deployment boots into a warm + # cache and avoids the high-memory cold-compile path.
Apply this and + # wait for completion before applying the Deployment. + apiVersion: batch/v1 + kind: Job + metadata: + name: openarc-setup + labels: + app: openarc + annotations: + helm.sh/hook: pre-install,pre-upgrade + helm.sh/hook-weight: "-5" + helm.sh/hook-delete-policy: before-hook-creation + spec: + backoffLimit: 1 + ttlSecondsAfterFinished: 500 + template: + spec: + restartPolicy: Never + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + fsGroupChangePolicy: OnRootMismatch + containers: + - name: openarc + image: ghcr.io/searchsavior/openarc + imagePullPolicy: Always + command: ["python", "-c"] + # Note: The download logic below can be removed if using an image volume. + # If adding a netpol, allow access to `huggingface.co`, `**.huggingface.co`, and `**.hf.co` for model download, along with DNS resolution. + args: + - | + import os, subprocess, sys, time, urllib.request, urllib.error + from huggingface_hub import snapshot_download + from pathlib import Path + + model_name = os.environ["MODEL_NAME"] + model_type = os.environ["MODEL_TYPE"] + hf_repo = os.environ["HF_REPO"] + config_file = os.environ["OPENARC_CONFIG_FILE"] + model_path = os.environ.get("MODEL_PATH", Path(model_name) / "model") + cache_dir = os.environ.get("CACHE_DIR", Path(model_name) / "cache") + engine = os.environ.get("ENGINE", "ovgenai") + device = os.environ.get("DEVICE", "GPU") + + # All paths should be relative to the config file directory + os.chdir(Path(config_file).parent) + + os.makedirs(model_path, exist_ok=True) + snapshot_download(repo_id=hf_repo, local_dir=model_path) + + os.makedirs(os.path.dirname(config_file), exist_ok=True) + # Truncate so stale top-level fields from older schemas don't + # survive a rerun - `openarc add` only updates a single entry. 
+ open(config_file, "w").close() + subprocess.run( + ["openarc", "add", + f"--model-name={model_name}", + f"--model-path={model_path}", + f"--cache-dir={cache_dir}", + f"--engine={engine}", + f"--model-type={model_type}", + f"--device={device}"], + check=True, + ) + + os.makedirs(cache_dir, exist_ok=True) + + # Start the server and wait for it to be ready before trying to load the model + server = subprocess.Popen( + ["openarc", "serve", "start", "--host", "0.0.0.0", "--port", "8000"], + env={**os.environ, "MODEL_NAME": model_name} + ) + try: + for i in range(1, 61): + try: + urllib.request.urlopen("http://localhost:8000/v1/models", timeout=2).read() + break + except urllib.error.URLError: + time.sleep(1) + else: + sys.exit("Server never became ready within 60s") + + print(f"Server ready after {i}s; loading {model_name}", flush=True) + subprocess.run(["openarc", "load", model_name], check=True) + + # Verify that the model loaded correctly (and therefore compiled correctly) + resp = urllib.request.urlopen("http://localhost:8000/openarc/status", timeout=5).read().decode() + if f'"model_name":"{model_name}"' in resp and '"status":"loaded"' in resp: + print(f"Model loaded; OV cache populated at {cache_dir}") + else: + sys.exit(f"Model failed to load: {resp}") + finally: + server.terminate() + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + env: + - name: MODEL_NAME + value: qwen3 + - name: MODEL_TYPE + value: llm + - name: HF_REPO + value: Echo9Zulu/Qwen3-1.7B-int8_asym-ov + - name: OPENARC_CONFIG_FILE + value: /models/openarc_config.json + - name: OPENARC_LOG_FILE + value: /dev/null + - name: NUMBA_CACHE_DIR + value: /tmp/numba-cache + # huggingface-cli writes to HF_HOME; the default ~/.cache path + # is unwritable under readOnlyRootFilesystem. + - name: HF_HOME + value: /tmp/hf-home + resources: + requests: + cpu: 500m + # Compilation requires a significant amount of memory depending on the model. 
By doing this once, + # the peak memory usage for every deployment replica is much lower. + memory: &memory 6Gi + limits: + memory: *memory + claims: + - name: gpu + volumeMounts: + - name: models + mountPath: /models + - name: tmp + mountPath: /tmp + volumes: + - name: models + persistentVolumeClaim: + claimName: openarc-models + - name: tmp + emptyDir: {} + resourceClaims: + - name: gpu + resourceClaimTemplateName: openarc-gpu + --- + apiVersion: apps/v1 + kind: Deployment + metadata: + name: &app_name openarc + labels: &labels + app: *app_name + spec: + replicas: 2 # Optional + selector: + matchLabels: *labels + template: + metadata: + labels: *labels + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + fsGroupChangePolicy: OnRootMismatch + containers: + - name: openarc + image: ghcr.io/searchsavior/openarc + env: + - name: OPENARC_AUTOLOAD_MODEL + value: qwen3 + - name: OPENARC_CONFIG_FILE + value: /models/openarc_config.json + - name: OPENARC_LOG_FILE + value: /dev/null + - name: NUMBA_CACHE_DIR + value: /tmp/numba-cache + ports: + - containerPort: 8000 + name: http + # TODO add probes once I fix the /status endpoint bug + volumeMounts: + - name: models + mountPath: /models + - name: tmp + mountPath: /tmp + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + resources: + requests: + cpu: 100m + memory: &memory 2Gi + limits: + memory: *memory + claims: + - name: gpu + volumes: + - name: models + persistentVolumeClaim: + claimName: openarc-models + # Note: Audio transcription requests for ASR models store the audio contents here when the audio file is larger than 1MB.
+ - name: tmp + emptyDir: {} + resourceClaims: + - name: gpu + resourceClaimTemplateName: openarc-gpu + --- + apiVersion: policy/v1 + kind: PodDisruptionBudget + metadata: + name: &app_name openarc + labels: &labels + app: *app_name + spec: + minAvailable: 1 + selector: + matchLabels: *labels + --- + apiVersion: v1 + kind: Service + metadata: + name: &app_name openarc + labels: &labels + app: *app_name + spec: + selector: *labels + ports: + - name: http + port: 8000 + ```