diff --git a/.tekton/acceptance-tests-pr.yaml b/.tekton/acceptance-tests-pr.yaml
index 1a25ab9e..c74de833 100644
--- a/.tekton/acceptance-tests-pr.yaml
+++ b/.tekton/acceptance-tests-pr.yaml
@@ -93,3 +93,18 @@ spec:
             requests:
               storage: 200Mi
           storageClassName: standard
+    # ── Go module cache ──────────────────────────────────────────────────────
+    # Shared persistent volume populated once by the warm-go-cache Task before
+    # parallel release-tests Tasks start. All test pods mount this PVC and
+    # find all Go dependencies already present, eliminating the concurrent
+    # module downloads that caused gauge runner timeouts (>101 min).
+    #
+    # Pre-requisite (apply once per cluster):
+    #   oc apply -f tekton/pvc/go-module-cache.yaml -n openshift-pipelines
+    #
+    # The plumbing acceptance-tests Pipeline and release-tests Task also need
+    # to be updated to accept and use this workspace (see
+    # tekton/tasks/warm-go-cache.yaml and the companion plumbing PR).
+    - name: go-cache
+      persistentVolumeClaim:
+        claimName: go-module-cache
diff --git a/.tekton/create-ci-image-pipelinerun.yaml b/.tekton/create-ci-image-pipelinerun.yaml
index ff5bac15..6e6eea62 100644
--- a/.tekton/create-ci-image-pipelinerun.yaml
+++ b/.tekton/create-ci-image-pipelinerun.yaml
@@ -5,7 +5,7 @@ metadata:
   name: create-push-ci-image
   annotations:
     pipelinesascode.tekton.dev/on-cel-expression: |
-      "Dockerfile.CI".pathChanged() && ( event == "push" || event == "pull_request" )
+      ( "Dockerfile.CI".pathChanged() || "go.mod".pathChanged() || "go.sum".pathChanged() ) && ( event == "push" || event == "pull_request" )
     pipelinesascode.tekton.dev/max-keep-runs: "5"
 spec:
   taskRunSpecs:
diff --git a/Dockerfile.CI b/Dockerfile.CI
index 421bdd55..411b00f4 100644
--- a/Dockerfile.CI
+++ b/Dockerfile.CI
@@ -62,6 +62,23 @@ RUN wget https://github.com/getgauge/gauge/releases/download/v${GAUGE_VERSION}/g
     go env -w GOPROXY="https://proxy.golang.org,direct" &&\
     gauge version
 
+# ── Go module cache pre-population ────────────────────────────────────────────
+# When multiple release-tests tasks run in parallel across OCP cluster versions,
+# each task previously created a fresh empty GOPATH and downloaded all Go
+# modules independently, causing network saturation and gauge runner timeouts
+# (>101 min). By pre-downloading the full module graph here, every container
+# spawned from this image already has all deps available at /go/pkg/mod without
+# any network access. GOMODCACHE is set as an ENV so it is inherited by every
+# task process, keeping it independent of the per-task temporary GOPATH.
+# The CI image is rebuilt by PAC whenever Dockerfile.CI, go.mod or go.sum
+# changes, so the baked cache is always consistent with the module graph.
+ENV GOMODCACHE=/go/pkg/mod
+COPY go.mod go.sum /go-cache-warmup/
+RUN cd /go-cache-warmup && \
+    go env -w GOMODCACHE=/go/pkg/mod && \
+    go mod download -x && \
+    rm -rf /go-cache-warmup
+
 RUN wget https://github.com/sigstore/cosign/releases/download/v3.0.3/cosign-linux-amd64 -O /usr/bin/cosign && \
     chmod a+x /usr/bin/cosign
 
diff --git a/tekton/pvc/go-module-cache.yaml b/tekton/pvc/go-module-cache.yaml
new file mode 100644
index 00000000..dd04bc8f
--- /dev/null
+++ b/tekton/pvc/go-module-cache.yaml
@@ -0,0 +1,55 @@
+# go-module-cache PVC
+# ─────────────────────────────────────────────────────────────────────────────
+# Shared, persistent PVC for the Go module download cache.
+#
+# Why ReadWriteMany?
+#   Multiple release-tests TaskRun pods run in parallel (one per test suite).
+#   All of them need to mount the same volume concurrently - that requires an
+#   RWX access mode. On ROSA/OSD/OCP the default StorageClass is usually
+#   backed by Ceph RBD (RWO only). Ensure you use a StorageClass that supports
+#   ReadWriteMany (e.g. ocs-storagecluster-cephfs, nfs, or azurefile).
+#
+# Lifecycle:
+#   - This PVC is long-lived (not recreated per PipelineRun).
+#   - It is populated once by the warm-go-cache Task before parallel test tasks
+#     start. Subsequent runs reuse the existing cache; only changed modules are
+#     fetched.
+#   - If go.mod / go.sum change significantly, delete and recreate the PVC to
+#     avoid stale entries (or let warm-go-cache handle it with go mod download,
+#     which is additive-only).
+#
+# Apply once per cluster with:
+#   oc apply -f tekton/pvc/go-module-cache.yaml -n openshift-pipelines
+# ─────────────────────────────────────────────────────────────────────────────
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: go-module-cache
+  namespace: openshift-pipelines
+  labels:
+    app.kubernetes.io/part-of: release-tests
+    app.kubernetes.io/component: go-cache
+  annotations:
+    # Human-readable explanation of why this PVC exists
+    release-tests.tekton.dev/purpose: >
+      Shared Go module cache populated once per pipeline run by the
+      warm-go-cache task, then mounted read-only by all parallel
+      release-tests tasks to eliminate concurrent module downloads that
+      cause gauge runner timeout (>101 min).
+spec:
+  accessModes:
+    # ReadWriteMany is required so all parallel test pods can mount
+    # simultaneously. Use ocs-storagecluster-cephfs or equivalent.
+    - ReadWriteMany
+  resources:
+    requests:
+      # The full Go module graph for this project is ~1.5 GB.
+      # 5 Gi gives comfortable headroom for future growth.
+      storage: 5Gi
+  # Replace with the RWX-capable StorageClass available on your cluster.
+  # Examples:
+  #   OCP/OSD/ROSA (OCS) : ocs-storagecluster-cephfs
+  #   ARO                : azurefile-csi
+  #   GKE                : standard-rwx
+  #   Generic NFS        : nfs-client
+  storageClassName: ocs-storagecluster-cephfs
diff --git a/tekton/stepactions/warm-go-cache-with-oci.yaml b/tekton/stepactions/warm-go-cache-with-oci.yaml
new file mode 100644
index 00000000..e7dde8dc
--- /dev/null
+++ b/tekton/stepactions/warm-go-cache-with-oci.yaml
@@ -0,0 +1,145 @@
+# tekton-caches integration for cross-run Go module caching
+# ─────────────────────────────────────────────────────────────────────────────
+# This file enhances the warm-go-cache Task (tekton/tasks/warm-go-cache.yaml)
+# with tekton-caches StepActions so that the Go module cache is also persisted
+# across pipeline runs in an OCI registry.
+#
+# How it works (two-tier strategy):
+#
+#   Tier 1 – within a single pipeline run (PVC):
+#     warm-go-cache Task runs once → fills PVC → all parallel test tasks
+#     mount the PVC and find modules locally.
+#
+#   Tier 2 – across pipeline runs (OCI registry via tekton-caches):
+#     cache-fetch StepAction pulls the tarball from Quay.io at the start of
+#     warm-go-cache; if the hash matches, the PVC is pre-filled in seconds
+#     (no module-proxy download at all).
+#     cache-upload StepAction pushes a fresh tarball when the hash changes.
+#
+# Cache key: SHA-256 hash of go.sum (content-addressed → auto-invalidates on
+# any dependency change, no manual purge needed).
+#
+# Pre-requisites:
+#   1. StepActions feature gate must be enabled in TektonCD Pipelines:
+#        kubectl patch configmap -n tekton-pipelines --type merge \
+#          -p '{"data":{"enable-step-actions":"true"}}' feature-flags
+#   2. A robot account on Quay.io with write access to
+#      quay.io/openshift-pipeline/release-tests-go-cache.
+#      Create a Tekton secret and link it to the SA used by the pipeline.
+#
+# Apply with:
+#   kubectl apply -f tekton/stepactions/cache-fetch.yaml
+#   kubectl apply -f tekton/stepactions/cache-upload.yaml
+#   (or use the upstream versions directly from tekton-caches)
+# ─────────────────────────────────────────────────────────────────────────────
+
+# ── Updated warm-go-cache Task with tekton-caches integration ─────────────────
+apiVersion: tekton.dev/v1
+kind: Task
+metadata:
+  name: warm-go-cache-with-oci
+  annotations:
+    tekton.dev/tags: "go,cache,oci"
+    tekton.dev/displayName: "Warm Go Module Cache (OCI-backed)"
+    tekton.dev/description: >
+      Populates the shared go-module-cache PVC using a two-tier strategy:
+      first tries to restore from OCI registry via tekton-caches (fast, cross-run),
+      then falls back to go mod download if the cache is stale or absent.
+      After a fresh download the new cache is uploaded back to the registry.
+spec:
+  params:
+    - name: IMAGE
+      description: CI image (Go + gauge)
+      default: quay.io/openshift-pipeline/ci
+    - name: REGISTRY
+      description: >
+        OCI registry path for the cache image, with {{hash}} placeholder.
+        Example: oci://quay.io/openshift-pipeline/release-tests-go-cache:{{hash}}
+      default: "oci://quay.io/openshift-pipeline/release-tests-go-cache:{{hash}}"
+    - name: GOPROXY
+      description: Go module proxy URL
+      default: "https://proxy.golang.org,direct"
+  workspaces:
+    - name: release-tests-git
+      description: Checked-out release-tests repository (go.mod / go.sum)
+    - name: go-cache
+      description: Shared RWX PVC for Go module cache (GOMODCACHE)
+    - name: dockerconfig
+      description: >
+        Workspace containing Docker config for OCI registry auth.
+        Bind to a secret holding .dockerconfigjson for Quay.io.
+      optional: true
+  steps:
+    # ── Step 1: restore cache from OCI registry ────────────────────────────
+    - name: cache-fetch
+      ref:
+        # Upstream StepAction from tekton-caches project.
+        # Pin to a specific digest in production.
+        resolver: http
+        params:
+          - name: url
+            value: https://raw.githubusercontent.com/openshift-pipelines/tekton-caches/main/tekton/cache-fetch.yaml
+      params:
+        - name: PATTERNS
+          value: ["go.sum"]
+        - name: SOURCE
+          value: $(params.REGISTRY)
+        - name: CACHE_PATH
+          value: $(workspaces.go-cache.path)/gopath/pkg/mod
+        - name: WORKING_DIR
+          value: $(workspaces.release-tests-git.path)
+        - name: DOCKER_CONFIG
+          value: $(workspaces.dockerconfig.path)
+
+    # ── Step 2: download any missing modules from module proxy ─────────────
+    - name: download-modules
+      image: $(params.IMAGE)
+      imagePullPolicy: Always
+      workingDir: $(workspaces.release-tests-git.path)
+      env:
+        - name: GOPATH
+          value: $(workspaces.go-cache.path)/gopath
+        - name: GOMODCACHE
+          value: $(workspaces.go-cache.path)/gopath/pkg/mod
+        - name: GOPROXY
+          value: $(params.GOPROXY)
+        - name: CACHE_FETCHED
+          value: $(steps.cache-fetch.results.fetched)
+      script: |
+        #!/usr/bin/env bash
+        set -eu -o pipefail
+
+        mkdir -p "${GOMODCACHE}"
+
+        if [[ "${CACHE_FETCHED}" == "true" ]]; then
+          echo "==> OCI cache hit – verifying completeness ..."
+          # go mod download is a no-op for cached modules and fetches only the
+          # missing ones. Not piped through head: SIGPIPE would trip pipefail.
+          go mod download
+        else
+          echo "==> OCI cache miss – downloading full module graph ..."
+          go mod download -x
+        fi
+
+        echo "==> Module cache size: $(du -sh "${GOMODCACHE}" | cut -f1)"
+
+    # ── Step 3: upload refreshed cache to OCI registry ────────────────────
+    - name: cache-upload
+      ref:
+        resolver: http
+        params:
+          - name: url
+            value: https://raw.githubusercontent.com/openshift-pipelines/tekton-caches/main/tekton/cache-upload.yaml
+      params:
+        - name: PATTERNS
+          value: ["go.sum"]
+        - name: TARGET
+          value: $(params.REGISTRY)
+        - name: CACHE_PATH
+          value: $(workspaces.go-cache.path)/gopath/pkg/mod
+        - name: WORKING_DIR
+          value: $(workspaces.release-tests-git.path)
+        - name: FETCHED
+          value: $(steps.cache-fetch.results.fetched)
+        - name: DOCKER_CONFIG
+          value: $(workspaces.dockerconfig.path)
diff --git a/tekton/tasks/warm-go-cache.yaml b/tekton/tasks/warm-go-cache.yaml
new file mode 100644
index 00000000..0d7c03ff
--- /dev/null
+++ b/tekton/tasks/warm-go-cache.yaml
@@ -0,0 +1,90 @@
+# warm-go-cache Task
+# ─────────────────────────────────────────────────────────────────────────────
+# Runs ONCE before all parallel release-tests Tasks in the acceptance-tests
+# Pipeline. It downloads the full Go module graph for this project into the
+# shared go-module-cache PVC so that every concurrent test pod finds its
+# dependencies locally and performs zero network downloads.
+#
+# Placement in the acceptance-tests Pipeline (plumbing repo):
+#   runAfter: [clone-release-tests-git]
+#   (all parallel release-tests tasks get runAfter: [warm-go-cache])
+#
+# This Task must be added to plumbing/ci/tasks/warm-go-cache.yaml and
+# referenced from plumbing/ci/pipelines/acceptance-tests.yaml.
+# ─────────────────────────────────────────────────────────────────────────────
+apiVersion: tekton.dev/v1
+kind: Task
+metadata:
+  name: warm-go-cache
+  annotations:
+    tekton.dev/tags: "go,cache"
+    tekton.dev/displayName: "Warm Go Module Cache"
+    tekton.dev/description: >
+      Populates the shared go-module-cache PVC by running go mod download
+      once before parallel test tasks start. Each test task can then point
+      GOMODCACHE at gopath/pkg/mod on the PVC and skip all module downloads,
+      avoiding the gauge runner timeout caused by concurrent network downloads.
+spec:
+  params:
+    - name: IMAGE
+      description: CI image that has Go, gauge and the same GOPROXY setting
+      default: quay.io/openshift-pipeline/ci
+    - name: GOPROXY
+      description: Go module proxy URL
+      default: "https://proxy.golang.org,direct"
+  workspaces:
+    - name: release-tests-git
+      description: Checked-out release-tests repository containing go.mod/go.sum
+    - name: go-cache
+      description: >
+        Shared PVC that will hold $GOPATH/pkg/mod after this task completes.
+        Must be bound to the go-module-cache ReadWriteMany PVC.
+  steps:
+    - name: download-modules
+      image: $(params.IMAGE)
+      imagePullPolicy: Always
+      workingDir: $(workspaces.release-tests-git.path)
+      env:
+        # Point GOPATH and GOMODCACHE at the shared PVC so that
+        # go mod download writes modules to persistent storage.
+        - name: GOPATH
+          value: $(workspaces.go-cache.path)/gopath
+        - name: GOMODCACHE
+          value: $(workspaces.go-cache.path)/gopath/pkg/mod
+        - name: GOPROXY
+          value: $(params.GOPROXY)
+        # Checksum verification stays enabled: go.sum pins every module hash.
+        # (The go command has no GONOSUMCHECK variable; for private or
+        # air-gapped module sources configure GOPRIVATE / GONOSUMDB instead.)
+      script: |
+        #!/usr/bin/env bash
+        set -eu -o pipefail
+
+        echo "==> Go module cache warm-up"
+        echo "    GOPATH      : ${GOPATH}"
+        echo "    GOMODCACHE  : ${GOMODCACHE}"
+        echo "    GOPROXY     : ${GOPROXY}"
+        echo "    go.mod hash : $(sha256sum go.mod | cut -d' ' -f1)"
+        echo "    go.sum hash : $(sha256sum go.sum | cut -d' ' -f1)"
+
+        # Create the module cache directory if it doesn't exist yet
+        mkdir -p "${GOMODCACHE}"
+
+        # Check whether the cache is already current by comparing the
+        # go.sum hash stored from the previous run.
+        HASH_FILE="${GOMODCACHE}/.go-sum-hash"
+        CURRENT_HASH=$(sha256sum go.sum | cut -d' ' -f1)
+
+        if [[ -f "${HASH_FILE}" ]] && [[ "$(cat "${HASH_FILE}")" == "${CURRENT_HASH}" ]]; then
+          echo "==> Cache is up-to-date (go.sum unchanged). Skipping download."
+          exit 0
+        fi
+
+        echo "==> Downloading Go modules (this populates the shared PVC) ..."
+        go mod download -x
+
+        # Persist the go.sum hash so future runs can skip the download
+        echo -n "${CURRENT_HASH}" > "${HASH_FILE}"
+
+        echo "==> Done. Module cache size:"
+        du -sh "${GOMODCACHE}" || true