From 31eac91ecbd93b52f6cb917da5786b152caf5bb5 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Mon, 8 Dec 2025 23:37:11 -0500 Subject: [PATCH 001/108] Add NFS cache support for CI sync data sharing - Add NFS cache variables (DATA_CACHE_NFS_PREFIX, SYNC_CACHE_KEY, SYNC_CACHE_TYPE) - Add HAF_COMMIT variable for service containers - Update HAF include ref to feature/nfs-cache-manager branch - Add validate_haf_commit job to ensure commit consistency - Update sync job with NFS cache fetch/push logic - Replace hfm-only-service with haf-instance-with-nfs-fallback service - Update all test jobs to use new service and haf-instance hostname - Change builder tags from hive-builder-9 to fast - Add docker/overrides/ci.yml for CI-specific HAF configuration - Update HAF submodule to feature/nfs-cache-manager branch --- .gitlab-ci.yml | 163 +++++++++++++++++++++++++++++----------- .gitmodules | 1 + docker/overrides/ci.yml | 33 ++++++++ submodules/haf | 2 +- 4 files changed, 154 insertions(+), 45 deletions(-) create mode 100644 docker/overrides/ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2dc4c54b..5c4bf018 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,17 +15,24 @@ variables: GIT_SUBMODULE_UPDATE_FLAGS: --jobs 4 # HAF configuration DATA_CACHE_HAF_PREFIX: "/cache/replay_data_haf" + # NFS cache configuration for sync data sharing across builders + DATA_CACHE_NFS_PREFIX: "/nfs/ci-cache" + SYNC_CACHE_KEY: "${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}" + SYNC_CACHE_TYPE: "haf_sync" BLOCK_LOG_SOURCE_DIR_5M: /blockchain/block_log_5m FF_NETWORK_PER_BUILD: 1 PYTEST_NUMBER_OF_PROCESSES: 8 # uses registry.gitlab.syncad.com/hive/haf/ci-base-image:ubuntu24.04-1 BUILDER_IMAGE_TAG: "$TEST_HAF_IMAGE_TAG" BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" + # HAF submodule commit - must match the 'ref:' in the include section below + # This is needed for service containers which can't access dotenv artifacts + HAF_COMMIT: "2c55a49ab1661a184a8bd6e9cf1434d88209a5b8" include: - template: Workflows/Branch-Pipelines.gitlab-ci.yml - project: hive/haf - ref: bf820442979eff6c7cb7e387f26cd4ccf9345f3c # develop + ref: 2c55a49ab1661a184a8bd6e9cf1434d88209a5b8 # feature/nfs-cache-manager file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly pulls templates/base.gitlab-ci.yml from common-ci-configuration # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos @@ -68,6 +75,34 @@ lint_sql_scripts: paths: - sql-lint.yaml +validate_haf_commit: + stage: build + image: alpine:latest + script: + - | + set -e + apk add --no-cache git + SUBMODULE_COMMIT=$(cat .git/modules/submodules/haf/HEAD 2>/dev/null || git -C submodules/haf rev-parse HEAD) + INCLUDE_REF=$(grep -A2 "project:.*hive/haf" .gitlab-ci.yml | grep "ref:" | head -1 | sed 's/.*ref: *\([a-f0-9]*\).*/\1/' || true) + echo "HAF_COMMIT variable: $HAF_COMMIT" + echo "HAF submodule HEAD: $SUBMODULE_COMMIT" + echo "Include ref: $INCLUDE_REF" + ERRORS=0 + if [ "$HAF_COMMIT" != "$SUBMODULE_COMMIT" ]; then + echo "ERROR: HAF_COMMIT variable does not match submodule commit!" + ERRORS=1 + fi + if [ "$HAF_COMMIT" != "$INCLUDE_REF" ]; then + echo "ERROR: HAF_COMMIT variable does not match include ref!" 
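# (INCLUDE_REF is extracted above with grep+sed: on a line like
#   'ref: 2c55a49ab1661a184a8bd6e9cf1434d88209a5b8 # feature/nfs-cache-manager'
# it yields the bare SHA; a branch-name ref or a missing ref line yields an
# empty string, which also fails this comparison.)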
+ ERRORS=1 + fi + if [ $ERRORS -eq 1 ]; then + exit 1 + fi + echo "All HAF commit references are consistent" + tags: + - public-runner-docker + prepare_haf_image: stage: build extends: .prepare_haf_image @@ -88,13 +123,14 @@ prepare_haf_data: - job: prepare_haf_image artifacts: true stage: build + timeout: 80m variables: SUBMODULE_DIR: "$CI_PROJECT_DIR/submodules/haf" BLOCK_LOG_SOURCE_DIR: $BLOCK_LOG_SOURCE_DIR_5M CONFIG_INI_SOURCE: "$CI_PROJECT_DIR/submodules/haf/docker/config_5M.ini" tags: - data-cache-storage - - hive-builder-9 + - fast .docker-base-build-template: extends: .docker_image_builder_job_template @@ -223,7 +259,7 @@ sync: HAF_SHM_DIRECTORY: ${SHM_DIR} BACKEND_VERSION: "$CI_COMMIT_SHORT_SHA" POSTGRES_ACCESS: postgresql://haf_admin@docker:5432/haf_block_log - COMPOSE_OPTIONS_STRING: --env-file ci.env --file docker-compose.yml --file overrides/dev.yml + COMPOSE_OPTIONS_STRING: --env-file ci.env --file docker-compose.yml --file overrides/ci.yml --ansi never timeout: 1 hours before_script: @@ -235,16 +271,43 @@ sync: git config --global --add safe.directory "$CI_PROJECT_DIR" git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf" echo -e "\e[0Ksection_end:$(date +%s):git\r\e[0K" + - | + # Ensure HAF replay data is available locally (fetch from NFS if needed) + LOCAL_HAF_CACHE="${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}" + if [[ -d "${LOCAL_HAF_CACHE}/datadir" ]]; then + echo "Local HAF cache found at ${LOCAL_HAF_CACHE}" + else + echo "Local HAF cache not found, checking NFS..." + CACHE_MANAGER="${CI_PROJECT_DIR}/submodules/haf/scripts/ci-helpers/cache-manager.sh" + if [[ -x "$CACHE_MANAGER" ]]; then + if "$CACHE_MANAGER" get haf "${HAF_COMMIT}" "${LOCAL_HAF_CACHE}"; then + echo "Fetched HAF replay data from NFS cache" + else + echo "ERROR: Failed to fetch HAF replay data from NFS cache" + exit 1 + fi + else + echo "ERROR: cache-manager.sh not found and local cache missing" + exit 1 + fi + fi script: - | echo -e "\e[0Ksection_start:$(date +%s):compose[collapsed=true]\r\e[0KStarting the test environment..." 
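# The \e[0Ksection_start:<unix-ts>:<name>[collapsed=true] / section_end pair is
# GitLab's collapsible-log-section protocol: everything printed between the two
# markers is folded into a named, initially-collapsed block in the job log UI.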
- cp "${BLOCK_LOG_SOURCE_DIR_5M}/block_log" "${CI_PROJECT_DIR}/docker/blockchain/block_log" - cp "${BLOCK_LOG_SOURCE_DIR_5M}/block_log.artifacts" "${CI_PROJECT_DIR}/docker/blockchain/block_log.artifacts" - chmod a+w docker/blockchain/block_log - "${CI_PROJECT_DIR}/submodules/haf/scripts/copy_datadir.sh" + PGDATA_PATH="${DATADIR}/haf_db_store/pgdata" + if [[ -d "$PGDATA_PATH" ]]; then + echo "Restoring pgdata permissions to mode 700" + sudo chmod 700 "$PGDATA_PATH" + sudo chown -R 105:105 "${DATADIR}/haf_db_store" + ls -la "${DATADIR}/haf_db_store/" + fi + + rm -rf "${CI_PROJECT_DIR}/docker/blockchain"/* + cp -a "${DATADIR}/blockchain"/* "${CI_PROJECT_DIR}/docker/blockchain/" + "${CI_PROJECT_DIR}/scripts/ci-helpers/start-ci-test-environment.sh" echo -e "\e[0Ksection_end:$(date +%s):compose\r\e[0K" @@ -272,21 +335,20 @@ sync: tar -cf - $(pwd)/docker/*.log | 7z a -si -mx9 docker/container-logs.tar.7z cp -a "${SHM_DIR}" "${DATADIR}/shm_dir" - cp -a "${CI_PROJECT_DIR}/docker/blockchain/block_log" "${DATADIR}/blockchain/block_log" - cp -a "${CI_PROJECT_DIR}/docker/blockchain/block_log.artifacts" "${DATADIR}/blockchain/block_log.artifacts" - mkdir -p "${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}_${CI_PIPELINE_ID}" - sudo cp -a "${DATADIR}" "${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}_${CI_PIPELINE_ID}" + LOCAL_SYNC_CACHE="${DATA_CACHE_HAF_PREFIX}_${SYNC_CACHE_KEY}" + mkdir -p "${LOCAL_SYNC_CACHE}" + sudo cp -a "${DATADIR}" "${LOCAL_SYNC_CACHE}" ls -lah "${DATADIR}" - ls -lah "${DATADIR}/blockchain" - ls -lah "${DATADIR}/shm_dir" + ls -lah "${LOCAL_SYNC_CACHE}" || true - ls -lah "${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}_${CI_PIPELINE_ID}" - ls -lah "${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}_${CI_PIPELINE_ID}/blockchain" - ls -lah "${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}_${CI_PIPELINE_ID}/shm_dir" + CACHE_MANAGER="${CI_PROJECT_DIR}/submodules/haf/scripts/ci-helpers/cache-manager.sh" + if [[ -x "$CACHE_MANAGER" ]]; then + echo "Pushing sync data to NFS cache: ${SYNC_CACHE_TYPE}/${SYNC_CACHE_KEY}" + "$CACHE_MANAGER" put "${SYNC_CACHE_TYPE}" "${SYNC_CACHE_KEY}" "${LOCAL_SYNC_CACHE}" || echo "Warning: Failed to push to NFS cache" + fi - # Manually remove the copy of the replay data to preserve disk space on the replay server sudo rm -rf ${CI_PROJECT_DIR}/${CI_JOB_ID} echo -e "\e[0Ksection_end:$(date +%s):compose2\r\e[0K" @@ -297,19 +359,40 @@ sync: when: always tags: - data-cache-storage - - hive-builder-9 + - fast -.hfm-only-service: &hfm-only-service - name: $HAF_IMAGE_NAME - alias: hfm-only-instance +.haf-instance-with-nfs-fallback: &haf-instance-with-nfs-fallback + name: ${HAF_IMAGE_NAME} + alias: haf-instance variables: - PGCTLTIMEOUT: 600 # give PostgreSQL more time to start if GitLab shut it down improperly after the sync job + PGCTLTIMEOUT: 600 PG_ACCESS: | "host all haf_admin 0.0.0.0/0 trust" "host all hived 0.0.0.0/0 trust" "host all hafbe_user 0.0.0.0/0 trust" "host all hafbe_owner 0.0.0.0/0 trust" "host all all 0.0.0.0/0 scram-sha-256" + DATA_SOURCE: "${DATA_CACHE_HAF_PREFIX}_${SYNC_CACHE_KEY}" + DATA_SOURCE_NFS_PREFIX: "${DATA_CACHE_NFS_PREFIX}" + DATA_SOURCE_NFS_TYPE: "${SYNC_CACHE_TYPE}" + DATA_SOURCE_NFS_KEY: "${SYNC_CACHE_KEY}" + entrypoint: + - '/bin/bash' + - '-c' + - | + set -xeuo pipefail + ORIGINAL_SOURCE="${DATA_SOURCE}" + NFS_PREFIX="${DATA_SOURCE_NFS_PREFIX:-/nfs/ci-cache}" + NFS_TYPE="${DATA_SOURCE_NFS_TYPE:-haf_sync}" + NFS_KEY="${DATA_SOURCE_NFS_KEY}" + if [[ ! 
-d "${ORIGINAL_SOURCE}/datadir" ]]; then + NFS_PATH="${NFS_PREFIX}/${NFS_TYPE}/${NFS_KEY}" + if [[ -d "${NFS_PATH}/datadir" ]]; then + export DATA_SOURCE="$NFS_PATH" + fi + fi + exec /home/haf_admin/docker_entrypoint.sh "$@" + - '/bin/bash' command: ["--execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] .postgrest-service: &postgrest-service @@ -318,8 +401,7 @@ sync: variables: PGRST_ADMIN_SERVER_PORT: 3001 PGRST_SERVER_PORT: 3000 - # Pointing to the PostgreSQL service running in hfm-only-instance - PGRST_DB_URI: postgresql://haf_admin@hfm-only-instance:5432/haf_block_log + PGRST_DB_URI: postgresql://haf_admin@haf-instance:5432/haf_block_log PGRST_DB_SCHEMA: hafbe_endpoints PGRST_DB_ANON_ROLE: hafbe_user PGRST_DB_POOL: 20 @@ -351,18 +433,16 @@ regression-test: - job: prepare_haf_image artifacts: true services: - - *hfm-only-service - variables: - DATA_SOURCE: ${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}_${CI_PIPELINE_ID} + - *haf-instance-with-nfs-fallback script: - | echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning tests..." cd tests/account_parameters - ./accounts_dump_test.sh --host=hfm-only-instance + ./accounts_dump_test.sh --host=haf-instance cd ../witness_parameters - ./witnesses_dump_test.sh --host=hfm-only-instance + ./witnesses_dump_test.sh --host=haf-instance echo -e "\e[0Ksection_end:$(date +%s):tests\r\e[0K" artifacts: @@ -372,7 +452,7 @@ regression-test: when: always tags: - data-cache-storage - - hive-builder-9 + - fast setup-scripts-test: image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-24.0.1-5 @@ -385,20 +465,18 @@ setup-scripts-test: - job: prepare_haf_image artifacts: true services: - - *hfm-only-service - variables: - DATA_SOURCE: ${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}_${CI_PIPELINE_ID} + - *haf-instance-with-nfs-fallback script: - | echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning tests..." cd tests/functional - ./test_scripts.sh --host=hfm-only-instance + ./test_scripts.sh --host=haf-instance echo -e "\e[0Ksection_end:$(date +%s):tests\r\e[0K" tags: - data-cache-storage - - hive-builder-9 + - fast performance-test: image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-24.0.1-5 @@ -411,15 +489,13 @@ performance-test: - job: prepare_haf_image artifacts: true services: - - *hfm-only-service + - *haf-instance-with-nfs-fallback - *postgrest-service - variables: - DATA_SOURCE: ${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}_${CI_PIPELINE_ID} script: - | echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning tests..." 
- timeout -k 1m 15m ./tests/run_performance_tests.sh --postgresql-host=hfm-only-instance --postgrest-host=postgrest-server --database-size=6000 --test-loop-count=1000 + timeout -k 1m 15m ./tests/run_performance_tests.sh --postgresql-host=haf-instance --postgrest-host=postgrest-server --database-size=6000 --test-loop-count=1000 tar -cf - $(pwd)/tests/performance/result* | 7z a -si -mx9 tests/performance/results.tar.7z cat jmeter.log | python3 docker/ci/parse-jmeter-output.py m2u --input $(pwd)/tests/performance/result/result.xml --output $(pwd)/tests/performance/junit-result.xml @@ -435,7 +511,7 @@ performance-test: junit: tests/performance/junit-result.xml tags: - data-cache-storage - - hive-builder-9 + - fast pattern-test: extends: .pytest_based_template @@ -448,10 +524,9 @@ pattern-test: - job: prepare_haf_image artifacts: true services: - - *hfm-only-service + - *haf-instance-with-nfs-fallback - *postgrest-service variables: - DATA_SOURCE: ${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}_${CI_PIPELINE_ID} JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml PYTEST_BASED_IMAGE_NAME: $BUILDER_IMAGE_PATH POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools @@ -467,7 +542,7 @@ pattern-test: - "**/*.out.json" tags: - data-cache-storage - - hive-builder-9 + - fast build_and_publish_image: stage: publish @@ -535,4 +610,4 @@ cleanup_haf_cache_manual: CLEANUP_PATH_PATTERN: "${DATA_CACHE_HAF_PREFIX}_*" tags: - data-cache-storage - - hive-builder-9 + - fast diff --git a/.gitmodules b/.gitmodules index 8b3eeb2d..d6b8aaf7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,6 +7,7 @@ [submodule "submodules/haf"] path = submodules/haf url = ../haf.git + branch = feature/nfs-cache-manager [submodule "submodules/reptracker"] path = submodules/reptracker url = ../reputation_tracker.git diff --git a/docker/overrides/ci.yml b/docker/overrides/ci.yml new file mode 100644 index 00000000..d6729bde --- /dev/null +++ b/docker/overrides/ci.yml @@ -0,0 +1,33 @@ +services: + haf: + environment: + PGCTLTIMEOUT: 600 + PG_ACCESS: " + host all all all trust\n + " + # HAF must replay from blockchain since prepare_haf_data only caches + # Hive data (blockchain, shared_memory.bin), not PostgreSQL database. + # Same as reputation_tracker - replay every time in sync job. 
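# The command below keeps the replay bounded to the CI dataset: --stop-at-block
# caps it at the 5M-block test block_log, and the 1G shared-memory file is
# sized for that range rather than for a full mainnet replay.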
+ HAF_COMMAND: "--shared-file-size=1G --plugin database_api --replay --stop-at-block=5000000" + ports: + - 5432:5432 + # Override base docker-compose.yml volumes to remove ./blockchain bind mount + # In CI, block_log is already in the datadir from cache, don't override it + volumes: + - haf_datadir:/home/hived/datadir + - haf_shmdir:/home/hived/shm_dir + - ./scripts/haf-healthcheck.sh:/home/hived/healthcheck.sh + +volumes: + haf_datadir: + driver: local + driver_opts: + o: bind + type: none + device: ${HAF_DATA_DIRECTORY}/ + haf_shmdir: + driver: local + driver_opts: + o: bind + type: none + device: ${HAF_SHM_DIRECTORY}/ diff --git a/submodules/haf b/submodules/haf index bf820442..2c55a49a 160000 --- a/submodules/haf +++ b/submodules/haf @@ -1 +1 @@ -Subproject commit bf820442979eff6c7cb7e387f26cd4ccf9345f3c +Subproject commit 2c55a49ab1661a184a8bd6e9cf1434d88209a5b8 -- GitLab From ae50b0b0d9457bb728f6f7c8b8435ceefcc8d10d Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 9 Dec 2025 01:01:13 -0500 Subject: [PATCH 002/108] Update HAF to df653a09 with pgdata permissions fix --- .gitlab-ci.yml | 4 ++-- submodules/haf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5c4bf018..c52ebf7b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -27,12 +27,12 @@ variables: BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" # HAF submodule commit - must match the 'ref:' in the include section below # This is needed for service containers which can't access dotenv artifacts - HAF_COMMIT: "2c55a49ab1661a184a8bd6e9cf1434d88209a5b8" + HAF_COMMIT: "df653a09a8f16c8f96a852c40443635f4c93887d" include: - template: Workflows/Branch-Pipelines.gitlab-ci.yml - project: hive/haf - ref: 2c55a49ab1661a184a8bd6e9cf1434d88209a5b8 # feature/nfs-cache-manager + ref: df653a09a8f16c8f96a852c40443635f4c93887d # feature/nfs-cache-manager file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly pulls templates/base.gitlab-ci.yml from common-ci-configuration # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos diff --git a/submodules/haf b/submodules/haf index 2c55a49a..df653a09 160000 --- a/submodules/haf +++ b/submodules/haf @@ -1 +1 @@ -Subproject commit 2c55a49ab1661a184a8bd6e9cf1434d88209a5b8 +Subproject commit df653a09a8f16c8f96a852c40443635f4c93887d -- GitLab From 2967e14e111e0aaa151ed53eb88406d055619bce Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 9 Dec 2025 01:34:33 -0500 Subject: [PATCH 003/108] Update HAF to e842bc87 (NFS-safe locking) --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c52ebf7b..a7b064ff 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -27,12 +27,12 @@ variables: BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" # HAF submodule commit - must match the 'ref:' in the include section below # This is needed for service containers which can't access dotenv artifacts - HAF_COMMIT: "df653a09a8f16c8f96a852c40443635f4c93887d" + HAF_COMMIT: "e842bc8773b4415077cb0fae028de9a03eb228ba" include: - template: Workflows/Branch-Pipelines.gitlab-ci.yml - project: hive/haf - ref: df653a09a8f16c8f96a852c40443635f4c93887d # feature/nfs-cache-manager + ref: e842bc8773b4415077cb0fae028de9a03eb228ba # feature/nfs-cache-manager file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly 
pulls templates/base.gitlab-ci.yml from common-ci-configuration # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos -- GitLab From 2cced5b47541380678e8a40f263c333cb3486264 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 9 Dec 2025 02:16:58 -0500 Subject: [PATCH 004/108] Update HAF to 642330a5 (tablespace permissions fix) --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a7b064ff..1f6b23b6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -27,12 +27,12 @@ variables: BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" # HAF submodule commit - must match the 'ref:' in the include section below # This is needed for service containers which can't access dotenv artifacts - HAF_COMMIT: "e842bc8773b4415077cb0fae028de9a03eb228ba" + HAF_COMMIT: "642330a54b44e7605423134e494eeaa40766399e" include: - template: Workflows/Branch-Pipelines.gitlab-ci.yml - project: hive/haf - ref: e842bc8773b4415077cb0fae028de9a03eb228ba # feature/nfs-cache-manager + ref: 642330a54b44e7605423134e494eeaa40766399e # feature/nfs-cache-manager file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly pulls templates/base.gitlab-ci.yml from common-ci-configuration # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos -- GitLab From 976ba563d8b82c5ee45e74109b80eaedf267e630 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 9 Dec 2025 02:33:06 -0500 Subject: [PATCH 005/108] Fix: Update HAF_COMMIT to match submodule HEAD --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1f6b23b6..c52ebf7b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -27,12 +27,12 @@ variables: BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" # HAF submodule commit - must match the 'ref:' in the include section below # This is needed for service containers which can't access dotenv artifacts - HAF_COMMIT: "642330a54b44e7605423134e494eeaa40766399e" + HAF_COMMIT: "df653a09a8f16c8f96a852c40443635f4c93887d" include: - template: Workflows/Branch-Pipelines.gitlab-ci.yml - project: hive/haf - ref: 642330a54b44e7605423134e494eeaa40766399e # feature/nfs-cache-manager + ref: df653a09a8f16c8f96a852c40443635f4c93887d # feature/nfs-cache-manager file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly pulls templates/base.gitlab-ci.yml from common-ci-configuration # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos -- GitLab From ab76b903e8b796dab61dc5901e68784bf8e6c4bf Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 9 Dec 2025 04:07:41 -0500 Subject: [PATCH 006/108] Fix: Update HAF to e7d9a8ac (local cache check before NFS fetch) Addresses pipeline timeout issue where cache-manager.sh would attempt slow NFS tar extraction even when local cache already exists on the builder. The HAF update adds a local cache check first, preventing unnecessary NFS operations. 
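Roughly, the fetch path now follows a check-local-first pattern. A minimal
sketch, with illustrative LOCAL_CACHE/NFS_TAR names (the real flag and
locking logic lives in cache-manager.sh in the HAF submodule):

    if [[ -d "${LOCAL_CACHE}/datadir" ]]; then
        echo "Local cache hit: ${LOCAL_CACHE}"      # fast path: no NFS I/O at all
    elif [[ -f "${NFS_TAR}" ]]; then
        mkdir -p "${LOCAL_CACHE}"
        tar xf "${NFS_TAR}" -C "${LOCAL_CACHE}"     # slow path: NFS tar extraction
    else
        echo "Cache miss in local and NFS caches" >&2
        exit 1
    fi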
--- .gitlab-ci.yml | 4 ++-- submodules/haf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c52ebf7b..692b40f6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -27,12 +27,12 @@ variables: BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" # HAF submodule commit - must match the 'ref:' in the include section below # This is needed for service containers which can't access dotenv artifacts - HAF_COMMIT: "df653a09a8f16c8f96a852c40443635f4c93887d" + HAF_COMMIT: "e7d9a8ac5883188f77f8d39cdfa7e7f6248a84a4" include: - template: Workflows/Branch-Pipelines.gitlab-ci.yml - project: hive/haf - ref: df653a09a8f16c8f96a852c40443635f4c93887d # feature/nfs-cache-manager + ref: e7d9a8ac5883188f77f8d39cdfa7e7f6248a84a4 # feature/nfs-cache-manager file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly pulls templates/base.gitlab-ci.yml from common-ci-configuration # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos diff --git a/submodules/haf b/submodules/haf index df653a09..e7d9a8ac 160000 --- a/submodules/haf +++ b/submodules/haf @@ -1 +1 @@ -Subproject commit df653a09a8f16c8f96a852c40443635f4c93887d +Subproject commit e7d9a8ac5883188f77f8d39cdfa7e7f6248a84a4 -- GitLab From e7d62cac448fe4ff6173abe0edfb9153525958fd Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 9 Dec 2025 15:03:40 -0500 Subject: [PATCH 007/108] Update HAF to 465036e7 (fix pgdata ownership check) --- .gitlab-ci.yml | 4 ++-- submodules/haf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 692b40f6..eb567a69 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -27,12 +27,12 @@ variables: BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" # HAF submodule commit - must match the 'ref:' in the include section below # This is needed for service containers which can't access dotenv artifacts - HAF_COMMIT: "e7d9a8ac5883188f77f8d39cdfa7e7f6248a84a4" + HAF_COMMIT: "465036e77ceb78cd71d1daeb377b6d0c0a21b857" include: - template: Workflows/Branch-Pipelines.gitlab-ci.yml - project: hive/haf - ref: e7d9a8ac5883188f77f8d39cdfa7e7f6248a84a4 # feature/nfs-cache-manager + ref: 465036e77ceb78cd71d1daeb377b6d0c0a21b857 # feature/nfs-cache-manager file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly pulls templates/base.gitlab-ci.yml from common-ci-configuration # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos diff --git a/submodules/haf b/submodules/haf index e7d9a8ac..465036e7 160000 --- a/submodules/haf +++ b/submodules/haf @@ -1 +1 @@ -Subproject commit e7d9a8ac5883188f77f8d39cdfa7e7f6248a84a4 +Subproject commit 465036e77ceb78cd71d1daeb377b6d0c0a21b857 -- GitLab From e21c7264d9c2ea4edd25e288e87b25e2b48a3096 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 9 Dec 2025 15:43:56 -0500 Subject: [PATCH 008/108] Fix: Dereference symlinks when copying blockchain to docker --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index eb567a69..fda0ef89 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -306,7 +306,7 @@ sync: fi rm -rf "${CI_PROJECT_DIR}/docker/blockchain"/* - cp -a "${DATADIR}/blockchain"/* "${CI_PROJECT_DIR}/docker/blockchain/" + cp -a -L 
"${DATADIR}/blockchain"/* "${CI_PROJECT_DIR}/docker/blockchain/" "${CI_PROJECT_DIR}/scripts/ci-helpers/start-ci-test-environment.sh" -- GitLab From 5e27790514f6d8cbdaa77c857910424ad788d156 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 9 Dec 2025 16:10:27 -0500 Subject: [PATCH 009/108] Fix: Restore pgdata permissions after cache-manager put --- .gitlab-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fda0ef89..ee8e41de 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -347,6 +347,12 @@ sync: if [[ -x "$CACHE_MANAGER" ]]; then echo "Pushing sync data to NFS cache: ${SYNC_CACHE_TYPE}/${SYNC_CACHE_KEY}" "$CACHE_MANAGER" put "${SYNC_CACHE_TYPE}" "${SYNC_CACHE_KEY}" "${LOCAL_SYNC_CACHE}" || echo "Warning: Failed to push to NFS cache" + # Restore pgdata permissions on local cache (cache-manager put relaxes them for NFS copy) + PGDATA_PATH="${LOCAL_SYNC_CACHE}/datadir/haf_db_store/pgdata" + if [[ -d "$PGDATA_PATH" ]]; then + echo "Restoring local cache pgdata permissions to mode 700" + sudo chmod 700 "$PGDATA_PATH" + fi fi sudo rm -rf ${CI_PROJECT_DIR}/${CI_JOB_ID} -- GitLab From 48cbfbb904cf67db3c41189d80beeedcdd97bd8b Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 10 Dec 2025 16:15:39 -0500 Subject: [PATCH 010/108] Fix: add NFS tar extraction with pgdata permission fixes for HAF service --- .gitlab-ci.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ee8e41de..df91a6be 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -387,16 +387,47 @@ sync: - '-c' - | set -xeuo pipefail + echo "Checking data source availability..." ORIGINAL_SOURCE="${DATA_SOURCE}" NFS_PREFIX="${DATA_SOURCE_NFS_PREFIX:-/nfs/ci-cache}" NFS_TYPE="${DATA_SOURCE_NFS_TYPE:-haf_sync}" NFS_KEY="${DATA_SOURCE_NFS_KEY}" + + # If original path doesn't exist, try NFS fallback if [[ ! -d "${ORIGINAL_SOURCE}/datadir" ]]; then + echo "Local cache not found: ${ORIGINAL_SOURCE}/datadir" NFS_PATH="${NFS_PREFIX}/${NFS_TYPE}/${NFS_KEY}" + NFS_TAR="${NFS_PATH}.tar" + if [[ -d "${NFS_PATH}/datadir" ]]; then + echo "Found data on NFS directory: $NFS_PATH" export DATA_SOURCE="$NFS_PATH" + elif [[ -f "${NFS_TAR}" ]]; then + echo "Found NFS tar archive: $NFS_TAR" + echo "Extracting to: ${ORIGINAL_SOURCE}" + mkdir -p "${ORIGINAL_SOURCE}" + tar xf "${NFS_TAR}" -C "${ORIGINAL_SOURCE}" + # Restore pgdata ownership and permissions for PostgreSQL + # PostgreSQL requires pgdata to be owned by postgres (UID 105) with mode 700 + if [[ -d "${ORIGINAL_SOURCE}/datadir/haf_db_store" ]]; then + echo "Fixing PostgreSQL data permissions..." 
+ sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_db_store" + sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" + fi + echo "Extracted NFS cache successfully" + export DATA_SOURCE="${ORIGINAL_SOURCE}" + else + echo "WARNING: Data not found in local or NFS cache" + echo "Checked: ${ORIGINAL_SOURCE}/datadir" + echo "Checked: ${NFS_PATH}/datadir" + echo "Checked: ${NFS_TAR}" fi + else + echo "Using local cache: ${ORIGINAL_SOURCE}" fi + + # Run original entrypoint exec /home/haf_admin/docker_entrypoint.sh "$@" - '/bin/bash' command: ["--execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] -- GitLab From a00562639f930976f026c959247f97b65e2a3727 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Thu, 18 Dec 2025 13:09:51 -0500 Subject: [PATCH 011/108] Update HAF submodule with CI-specific postgres config - Update HAF to 48407d1a5 (develop) - Add HAF_CI_MODE=1 to enable reduced memory postgres config - Add CACHE_HANDLING=haf to cache-manager calls for proper pgdata handling - Rename cache type to hafbe_sync (project-specific) - This allows multiple concurrent jobs on 64GB builders without swapping --- .gitlab-ci.yml | 45 ++++++++++++++++++++++++++++++--------------- submodules/haf | 2 +- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index df91a6be..cf285ba4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -17,8 +17,8 @@ variables: DATA_CACHE_HAF_PREFIX: "/cache/replay_data_haf" # NFS cache configuration for sync data sharing across builders DATA_CACHE_NFS_PREFIX: "/nfs/ci-cache" - SYNC_CACHE_KEY: "${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}" - SYNC_CACHE_TYPE: "haf_sync" + HAFBE_CACHE_KEY: "${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}" + HAFBE_SYNC_CACHE_TYPE: "hafbe_sync" BLOCK_LOG_SOURCE_DIR_5M: /blockchain/block_log_5m FF_NETWORK_PER_BUILD: 1 PYTEST_NUMBER_OF_PROCESSES: 8 @@ -27,12 +27,14 @@ variables: BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" # HAF submodule commit - must match the 'ref:' in the include section below # This is needed for service containers which can't access dotenv artifacts - HAF_COMMIT: "465036e77ceb78cd71d1daeb377b6d0c0a21b857" + HAF_COMMIT: "48407d1a5c08bef22d4cab7262e96e4202b8d99a" + # Enable CI-specific PostgreSQL config with reduced memory for HAF service containers + HAF_CI_MODE: "1" include: - template: Workflows/Branch-Pipelines.gitlab-ci.yml - project: hive/haf - ref: 465036e77ceb78cd71d1daeb377b6d0c0a21b857 # feature/nfs-cache-manager + ref: 48407d1a5c08bef22d4cab7262e96e4202b8d99a # develop file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly pulls templates/base.gitlab-ci.yml from common-ci-configuration # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos @@ -280,7 +282,7 @@ sync: echo "Local HAF cache not found, checking NFS..." 
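# Assumed cache-manager.sh interface as used in this job (the authoritative
# implementation lives in the HAF submodule):
#   [CACHE_HANDLING=haf] cache-manager.sh get <type> <key> <dest-dir>
#   [CACHE_HANDLING=haf] cache-manager.sh put <type> <key> <src-dir>
# 'get' materializes <NFS prefix>/<type>/<key> into <dest-dir>; 'put' publishes
# <src-dir> under that key; CACHE_HANDLING=haf wraps the copy with the
# pgdata-aware permission handling described in this commit's message.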
CACHE_MANAGER="${CI_PROJECT_DIR}/submodules/haf/scripts/ci-helpers/cache-manager.sh" if [[ -x "$CACHE_MANAGER" ]]; then - if "$CACHE_MANAGER" get haf "${HAF_COMMIT}" "${LOCAL_HAF_CACHE}"; then + if CACHE_HANDLING=haf "$CACHE_MANAGER" get haf "${HAF_COMMIT}" "${LOCAL_HAF_CACHE}"; then echo "Fetched HAF replay data from NFS cache" else echo "ERROR: Failed to fetch HAF replay data from NFS cache" @@ -336,19 +338,19 @@ sync: tar -cf - $(pwd)/docker/*.log | 7z a -si -mx9 docker/container-logs.tar.7z cp -a "${SHM_DIR}" "${DATADIR}/shm_dir" - LOCAL_SYNC_CACHE="${DATA_CACHE_HAF_PREFIX}_${SYNC_CACHE_KEY}" - mkdir -p "${LOCAL_SYNC_CACHE}" - sudo cp -a "${DATADIR}" "${LOCAL_SYNC_CACHE}" + LOCAL_HAFBE_CACHE="${DATA_CACHE_HAF_PREFIX}_${HAFBE_CACHE_KEY}" + mkdir -p "${LOCAL_HAFBE_CACHE}" + sudo cp -a "${DATADIR}" "${LOCAL_HAFBE_CACHE}" ls -lah "${DATADIR}" - ls -lah "${LOCAL_SYNC_CACHE}" || true + ls -lah "${LOCAL_HAFBE_CACHE}" || true CACHE_MANAGER="${CI_PROJECT_DIR}/submodules/haf/scripts/ci-helpers/cache-manager.sh" if [[ -x "$CACHE_MANAGER" ]]; then - echo "Pushing sync data to NFS cache: ${SYNC_CACHE_TYPE}/${SYNC_CACHE_KEY}" - "$CACHE_MANAGER" put "${SYNC_CACHE_TYPE}" "${SYNC_CACHE_KEY}" "${LOCAL_SYNC_CACHE}" || echo "Warning: Failed to push to NFS cache" + echo "Pushing sync data to NFS cache: ${HAFBE_SYNC_CACHE_TYPE}/${HAFBE_CACHE_KEY}" + CACHE_HANDLING=haf "$CACHE_MANAGER" put "${HAFBE_SYNC_CACHE_TYPE}" "${HAFBE_CACHE_KEY}" "${LOCAL_HAFBE_CACHE}" || echo "Warning: Failed to push to NFS cache" # Restore pgdata permissions on local cache (cache-manager put relaxes them for NFS copy) - PGDATA_PATH="${LOCAL_SYNC_CACHE}/datadir/haf_db_store/pgdata" + PGDATA_PATH="${LOCAL_HAFBE_CACHE}/datadir/haf_db_store/pgdata" if [[ -d "$PGDATA_PATH" ]]; then echo "Restoring local cache pgdata permissions to mode 700" sudo chmod 700 "$PGDATA_PATH" @@ -378,10 +380,10 @@ sync: "host all hafbe_user 0.0.0.0/0 trust" "host all hafbe_owner 0.0.0.0/0 trust" "host all all 0.0.0.0/0 scram-sha-256" - DATA_SOURCE: "${DATA_CACHE_HAF_PREFIX}_${SYNC_CACHE_KEY}" + DATA_SOURCE: "${DATA_CACHE_HAF_PREFIX}_${HAFBE_CACHE_KEY}" DATA_SOURCE_NFS_PREFIX: "${DATA_CACHE_NFS_PREFIX}" - DATA_SOURCE_NFS_TYPE: "${SYNC_CACHE_TYPE}" - DATA_SOURCE_NFS_KEY: "${SYNC_CACHE_KEY}" + DATA_SOURCE_NFS_TYPE: "${HAFBE_SYNC_CACHE_TYPE}" + DATA_SOURCE_NFS_KEY: "${HAFBE_CACHE_KEY}" entrypoint: - '/bin/bash' - '-c' @@ -460,6 +462,7 @@ python_api_client_test: - public-runner-docker regression-test: + extends: .wait-for-haf-postgres image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-24.0.1-5 stage: test needs: @@ -469,6 +472,8 @@ regression-test: artifacts: true - job: prepare_haf_image artifacts: true + variables: + HAF_APP_SCHEMA: "hafbe_app" services: - *haf-instance-with-nfs-fallback script: @@ -492,6 +497,7 @@ regression-test: - fast setup-scripts-test: + extends: .wait-for-haf-postgres image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-24.0.1-5 stage: test needs: @@ -501,6 +507,8 @@ setup-scripts-test: artifacts: true - job: prepare_haf_image artifacts: true + variables: + HAF_APP_SCHEMA: "hafbe_app" services: - *haf-instance-with-nfs-fallback script: @@ -516,6 +524,7 @@ setup-scripts-test: - fast performance-test: + extends: .wait-for-haf-postgres image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-24.0.1-5 stage: test needs: @@ -525,6 +534,8 @@ performance-test: artifacts: true - job: prepare_haf_image artifacts: true + variables: + HAF_APP_SCHEMA: "hafbe_app" services: - 
*haf-instance-with-nfs-fallback - *postgrest-service @@ -570,6 +581,10 @@ pattern-test: HAFBE_ADDRESS: postgrest-server HAFBE_PORT: 3000 TAVERN_DIR: $CI_PROJECT_DIR/tests/tavern + HAF_APP_SCHEMA: "hafbe_app" + before_script: + - !reference [.pytest_based_template, before_script] + - !reference [.wait-for-haf-postgres, before_script] script: - | cd $CI_PROJECT_DIR/tests/tavern diff --git a/submodules/haf b/submodules/haf index 465036e7..48407d1a 160000 --- a/submodules/haf +++ b/submodules/haf @@ -1 +1 @@ -Subproject commit 465036e77ceb78cd71d1daeb377b6d0c0a21b857 +Subproject commit 48407d1a5c08bef22d4cab7262e96e4202b8d99a -- GitLab From 2977ca6642f83ae515ac3f437f4f12fc1330689a Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Mon, 29 Dec 2025 22:56:52 -0500 Subject: [PATCH 012/108] Update to latest HAF develop and common-ci-configuration images - Update HAF submodule to b4225f9d (latest develop) - Rebase ci-runner on common-ci-configuration/docker-builder:latest - Update Dockerfile syntax to dockerfile:1.11 - Update postgrest service to common-ci-configuration/postgrest:v12.0.2 - Update nginx image to common-ci-configuration/nginx:latest - Add QUICK_TEST mode for faster CI iterations - Add docs-only change detection to skip unnecessary jobs --- .gitlab-ci.yml | 206 +++++++++++++++++++++------ Dockerfile | 2 +- Dockerfile.rewriter | 2 +- docker-bake.hcl | 2 +- docker/ci/Dockerfile | 17 ++- scripts/ci-helpers/skip_rules.yml | 225 ++++++++++++++++++++++++++++++ submodules/haf | 2 +- 7 files changed, 407 insertions(+), 49 deletions(-) create mode 100644 scripts/ci-helpers/skip_rules.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cf285ba4..951343cd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -27,16 +27,24 @@ variables: BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" # HAF submodule commit - must match the 'ref:' in the include section below # This is needed for service containers which can't access dotenv artifacts - HAF_COMMIT: "48407d1a5c08bef22d4cab7262e96e4202b8d99a" + HAF_COMMIT: "b4225f9d2591195b0e6aadf36bbef921d95f92b9" # Enable CI-specific PostgreSQL config with reduced memory for HAF service containers HAF_CI_MODE: "1" + # Quick Test Mode - uses cached HAF data from previous pipeline, skips HAF rebuild/replay + # Usage: Set QUICK_TEST=true and QUICK_TEST_HAF_COMMIT to a commit with cached data + # Find available cache keys: ssh hive-builder-10 'ls -lt /nfs/ci-cache/haf/*.tar | head -5' + QUICK_TEST: "false" + QUICK_TEST_HAF_COMMIT: "" + include: - template: Workflows/Branch-Pipelines.gitlab-ci.yml - project: hive/haf - ref: 48407d1a5c08bef22d4cab7262e96e4202b8d99a # develop + ref: b4225f9d2591195b0e6aadf36bbef921d95f92b9 # develop file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly pulls templates/base.gitlab-ci.yml from common-ci-configuration # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos +# Skip rules for docs-only changes and QUICK_TEST mode +- local: '/scripts/ci-helpers/skip_rules.yml' .lint_job: extends: .job-defaults @@ -77,6 +85,36 @@ lint_sql_scripts: paths: - sql-lint.yaml +# Quick Test Mode Setup - overrides HAF_COMMIT with cached version +quick_test_setup: + stage: lint + image: alpine:latest + needs: [] + script: + - | + echo "Quick Test Mode enabled" + echo "Using cached HAF data from commit: $QUICK_TEST_HAF_COMMIT" + + if [ -z "$QUICK_TEST_HAF_COMMIT" ]; then + echo "ERROR: QUICK_TEST_HAF_COMMIT 
must be set when QUICK_TEST=true" + echo "Find available cache keys: ssh hive-builder-10 'ls -lt /nfs/ci-cache/haf/*.tar | head -5'" + exit 1 + fi + + # Override HAF_COMMIT with the cached version + echo "HAF_COMMIT=$QUICK_TEST_HAF_COMMIT" > quick_test.env + echo "HAFBE_CACHE_KEY=${QUICK_TEST_HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}" >> quick_test.env + cat quick_test.env + artifacts: + reports: + dotenv: quick_test.env + rules: + - if: $QUICK_TEST == "true" + when: always + - when: never + tags: + - public-runner-docker + validate_haf_commit: stage: build image: alpine:latest @@ -102,12 +140,25 @@ validate_haf_commit: exit 1 fi echo "All HAF commit references are consistent" + rules: + # Skip in QUICK_TEST mode - we're using cached data from a different commit + - if: $QUICK_TEST == "true" + when: never + # Skip for docs-only changes + - if: $DOCS_ONLY == "true" + when: never + - when: on_success tags: - public-runner-docker prepare_haf_image: stage: build - extends: .prepare_haf_image + extends: + - .prepare_haf_image + - .skip_on_docs_only_or_quick_test + needs: + - job: detect_changes + optional: true # Not needed on protected branches variables: SUBMODULE_DIR: "$CI_PROJECT_DIR/submodules/haf" REGISTRY_USER: "$HAF_DEPLOY_USERNAME" @@ -120,10 +171,15 @@ prepare_haf_image: - hived prepare_haf_data: - extends: .prepare_haf_data_5m + extends: + - .prepare_haf_data_5m + - .skip_on_docs_only_or_quick_test needs: - - job: prepare_haf_image - artifacts: true + - job: detect_changes + optional: true + - job: prepare_haf_image + artifacts: true + optional: true # Not needed if build was skipped stage: build timeout: 80m variables: @@ -178,14 +234,22 @@ prepare_haf_data: docker-ci-runner-build: extends: .docker-base-build-template + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success variables: BASE_REPO_NAME: "" - BASE_TAG: "docker-24.0.1-5" + BASE_TAG: "docker-26.1.4-1" NAME: "ci-runner" TARGET: "ci-runner-ci" docker-setup-docker-image-build: extends: .docker-base-build-template + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success variables: GIT_SUBMODULE_STRATEGY: none GIT_DEPTH: 1 @@ -197,6 +261,10 @@ docker-setup-docker-image-build: extract-swagger-json: extends: .filter_out_swagger_json stage: build + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success variables: INPUT_SQL_SWAGGER_FILE: "${CI_PROJECT_DIR}/endpoints/endpoint_schema.sql" tags: @@ -205,6 +273,10 @@ extract-swagger-json: generate-wax-spec: extends: .generate_swagger_package stage: build + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success variables: INPUT_JSON_SWAGGER_FILE: "${BUILT_JSON_SWAGGER_FILE}" API_TYPE: "rest" @@ -219,6 +291,10 @@ generate-wax-spec: generate_python_api_client: extends: .project_develop_configuration_template stage: build + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success variables: PYPROJECT_DIR: "${CI_PROJECT_DIR}/scripts/python_api_package" needs: @@ -236,6 +312,10 @@ generate_python_api_client: build_python_api_client_wheel: extends: .build_wheel_template stage: build + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success needs: - job: generate_python_api_client artifacts: true @@ -247,12 +327,22 @@ build_python_api_client_wheel: sync: extends: .docker_image_builder_job_template stage: sync - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-24.0.1-5 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 needs: - - prepare_haf_image - 
- prepare_haf_data - - docker-setup-docker-image-build - - docker-ci-runner-build + - job: quick_test_setup + artifacts: true + optional: true # Only runs in QUICK_TEST mode + - job: prepare_haf_image + optional: true # Skipped in QUICK_TEST mode + - job: prepare_haf_data + optional: true # Skipped in QUICK_TEST mode + - job: docker-setup-docker-image-build + - job: docker-ci-runner-build + rules: + # Skip for docs-only changes + - if: $DOCS_ONLY == "true" + when: never + - when: on_success variables: DATA_SOURCE: ${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT} DATADIR: ${CI_PROJECT_DIR}/${CI_JOB_ID}/datadir @@ -435,7 +525,7 @@ sync: command: ["--execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] .postgrest-service: &postgrest-service - name: registry.gitlab.syncad.com/hive/haf_api_node/postgrest:latest + name: registry.gitlab.syncad.com/hive/common-ci-configuration/postgrest:v12.0.2 alias: postgrest-server variables: PGRST_ADMIN_SERVER_PORT: 3001 @@ -451,6 +541,10 @@ sync: python_api_client_test: extends: .project_develop_configuration_template stage: test + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success needs: - job: generate_python_api_client artifacts: true @@ -463,15 +557,23 @@ python_api_client_test: regression-test: extends: .wait-for-haf-postgres - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-24.0.1-5 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 stage: test needs: - - job: sync - artifacts: true - - job: docker-setup-docker-image-build - artifacts: true - - job: prepare_haf_image - artifacts: true + - job: quick_test_setup + artifacts: true + optional: true + - job: sync + artifacts: true + - job: docker-setup-docker-image-build + artifacts: true + - job: prepare_haf_image + artifacts: true + optional: true # Not needed in QUICK_TEST mode + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success variables: HAF_APP_SCHEMA: "hafbe_app" services: @@ -498,15 +600,23 @@ regression-test: setup-scripts-test: extends: .wait-for-haf-postgres - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-24.0.1-5 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 stage: test needs: - - job: sync - artifacts: true - - job: docker-setup-docker-image-build - artifacts: true - - job: prepare_haf_image - artifacts: true + - job: quick_test_setup + artifacts: true + optional: true + - job: sync + artifacts: true + - job: docker-setup-docker-image-build + artifacts: true + - job: prepare_haf_image + artifacts: true + optional: true + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success variables: HAF_APP_SCHEMA: "hafbe_app" services: @@ -525,15 +635,23 @@ setup-scripts-test: performance-test: extends: .wait-for-haf-postgres - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-24.0.1-5 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 stage: test needs: - - job: sync - artifacts: true - - job: docker-setup-docker-image-build - artifacts: true - - job: prepare_haf_image - artifacts: true + - job: quick_test_setup + artifacts: true + optional: true + - job: sync + artifacts: true + - job: docker-setup-docker-image-build + artifacts: true + - job: prepare_haf_image + artifacts: true + optional: true + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success variables: HAF_APP_SCHEMA: "hafbe_app" services: @@ -565,12 +683,20 
@@ pattern-test: extends: .pytest_based_template stage: test needs: - - job: sync - artifacts: true - - job: docker-setup-docker-image-build - artifacts: true - - job: prepare_haf_image - artifacts: true + - job: quick_test_setup + artifacts: true + optional: true + - job: sync + artifacts: true + - job: docker-setup-docker-image-build + artifacts: true + - job: prepare_haf_image + artifacts: true + optional: true + rules: + - if: $DOCS_ONLY == "true" + when: never + - when: on_success services: - *haf-instance-with-nfs-fallback - *postgrest-service diff --git a/Dockerfile b/Dockerfile index d46b0f80..36c939ca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -# syntax=registry.gitlab.syncad.com/hive/common-ci-configuration/dockerfile:1.5 +# syntax=registry.gitlab.syncad.com/hive/common-ci-configuration/dockerfile:1.11 ARG PSQL_CLIENT_VERSION=14-1 FROM registry.gitlab.syncad.com/hive/common-ci-configuration/psql:$PSQL_CLIENT_VERSION AS psql diff --git a/Dockerfile.rewriter b/Dockerfile.rewriter index 3356cd60..dbb35bfb 100644 --- a/Dockerfile.rewriter +++ b/Dockerfile.rewriter @@ -1,4 +1,4 @@ -FROM registry.gitlab.syncad.com/hive/common-ci-configuration/nginx:ecd325dd43aee24562f59195ef51a20fa15514d4 AS without_tag +FROM registry.gitlab.syncad.com/hive/common-ci-configuration/nginx:latest AS without_tag COPY docker/haf_block_explorer_nginx.conf.template /usr/local/openresty/nginx/conf/nginx.conf.template COPY rewrite_rules.conf /usr/local/openresty/nginx/conf/rewrite_rules.conf diff --git a/docker-bake.hcl b/docker-bake.hcl index 81b39c22..b4c3e34d 100644 --- a/docker-bake.hcl +++ b/docker-bake.hcl @@ -15,7 +15,7 @@ variable "TAG" { default = "latest" } variable "TAG_CI" { - default = "docker-24.0.1-5" + default = "docker-26.1.4-1" } variable "PSQL_CLIENT_VERSION" { default = "14-1" diff --git a/docker/ci/Dockerfile b/docker/ci/Dockerfile index ee5c7ad5..05ee7ce2 100644 --- a/docker/ci/Dockerfile +++ b/docker/ci/Dockerfile @@ -1,11 +1,18 @@ -# syntax=registry.gitlab.syncad.com/hive/common-ci-configuration/dockerfile:1.5 -# THe lastest CI runner image from balance_tracker repository -FROM registry.gitlab.syncad.com/hive/balance_tracker/ci-runner:docker-24.0.1-10 +# syntax=registry.gitlab.syncad.com/hive/common-ci-configuration/dockerfile:1.11 +# CI runner image based on common-ci-configuration docker-builder +FROM registry.gitlab.syncad.com/hive/common-ci-configuration/docker-builder:latest USER root RUN <<-EOF - # Install system dependencies - apk add --no-cache 7zip + # Install additional CI dependencies + apk add --no-cache \ + 7zip \ + postgresql16-client \ + sudo EOF +# Create hived user for compatibility with existing scripts +RUN adduser -D -s /bin/bash hived && \ + echo "hived ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + USER hived \ No newline at end of file diff --git a/scripts/ci-helpers/skip_rules.yml b/scripts/ci-helpers/skip_rules.yml new file mode 100644 index 00000000..b63c5a80 --- /dev/null +++ b/scripts/ci-helpers/skip_rules.yml @@ -0,0 +1,225 @@ +# Skip rules for CI job optimization based on changed files +# Include this file and use the patterns/templates in job rules +# +# Features: +# 1. Skip builds AND tests for docs-only changes +# 2. QUICK_TEST mode - use cached HAF data to skip prepare_haf_data job +# 3. 
Run full pipeline for source code changes +# +# Usage: +# Set QUICK_TEST=true and QUICK_TEST_HAF_COMMIT= to skip HAF rebuild/replay + +variables: + # Control variable to force full pipeline regardless of changes + FORCE_FULL_PIPELINE: "false" + # Set by detect_changes job - indicates only docs/non-source files changed + DOCS_ONLY: "false" + +# ============================================================================ +# Change Detection Job +# ============================================================================ + +.detect_changes: + stage: lint + image: alpine:latest + needs: [] + before_script: + - apk add --no-cache git + script: + - | + echo "Detecting what files changed..." + + # Determine base commit to compare against + if [ -n "$CI_MERGE_REQUEST_DIFF_BASE_SHA" ]; then + BASE_SHA="$CI_MERGE_REQUEST_DIFF_BASE_SHA" + echo "Using MR diff base: $BASE_SHA" + elif [ -n "$CI_COMMIT_BEFORE_SHA" ] && [ "$CI_COMMIT_BEFORE_SHA" != "0000000000000000000000000000000000000000" ]; then + BASE_SHA="$CI_COMMIT_BEFORE_SHA" + echo "Using commit before SHA: $BASE_SHA" + else + BASE_SHA="HEAD~1" + echo "Using HEAD~1 as base" + fi + + # Get list of changed files + echo "Changed files:" + git diff --name-only "$BASE_SHA" HEAD 2>/dev/null || git diff --name-only HEAD~1 HEAD | head -50 + + # Check if source code changed (files that require builds/sync) + SOURCE_CHANGED="false" + if git diff --name-only "$BASE_SHA" HEAD 2>/dev/null | grep -qE '^(backend/|endpoints/|docker/|scripts/|submodules/|Dockerfile|\.gitlab-ci\.yml)'; then + SOURCE_CHANGED="true" + fi + + # Check if tests changed + TESTS_CHANGED="false" + if git diff --name-only "$BASE_SHA" HEAD 2>/dev/null | grep -qE '^tests/'; then + TESTS_CHANGED="true" + fi + + # Determine if this is a docs-only change + # Docs-only means: only docs/readme/misc files changed, no source code or tests + DOCS_ONLY="false" + if [ "$SOURCE_CHANGED" = "false" ] && [ "$TESTS_CHANGED" = "false" ]; then + echo "No source or test changes detected" + # Verify something actually changed (not empty commit) + if git diff --name-only "$BASE_SHA" HEAD 2>/dev/null | grep -q .; then + DOCS_ONLY="true" + echo ">>> DOCS_ONLY=true - skipping builds and tests" + fi + fi + + echo "" + echo "Detection results:" + echo " SOURCE_CHANGED=$SOURCE_CHANGED" + echo " TESTS_CHANGED=$TESTS_CHANGED" + echo " DOCS_ONLY=$DOCS_ONLY" + + # Write to dotenv file for other jobs + echo "DOCS_ONLY=$DOCS_ONLY" > detect_changes.env + echo "SOURCE_CHANGED=$SOURCE_CHANGED" >> detect_changes.env + artifacts: + reports: + dotenv: detect_changes.env + rules: + # Skip detection on protected branches - always run full pipeline + - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" + when: never + # Skip on tags + - if: $CI_COMMIT_TAG + when: never + # Skip if forcing full pipeline + - if: $FORCE_FULL_PIPELINE == "true" + when: never + # Skip if QUICK_TEST is manually enabled + - if: $QUICK_TEST == "true" + when: never + - when: on_success + tags: + - public-runner-docker + +detect_changes: + extends: .detect_changes + +# ============================================================================ +# Rule Templates for Jobs +# ============================================================================ + +# Template: Skip build/sync jobs if only docs changed or QUICK_TEST enabled +.skip_on_docs_only_or_quick_test: + rules: + # QUICK_TEST mode - skip builds + - if: $QUICK_TEST == "true" + when: never + # DOCS_ONLY mode - skip builds + - if: $DOCS_ONLY == "true" + when: never + # Force full 
pipeline if requested + - if: $FORCE_FULL_PIPELINE == "true" + when: on_success + # Always run on protected branches + - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" + when: on_success + # Always run on tags + - if: $CI_COMMIT_TAG + when: on_success + # Run if source files changed + - changes: + paths: + - backend/**/* + - endpoints/**/* + - docker/**/* + - scripts/**/* + - submodules/**/* + - Dockerfile + - .gitlab-ci.yml + when: on_success + # Skip otherwise (docs-only changes) + - when: never + +# Template: Skip test jobs if only docs changed +.skip_test_on_docs_only: + rules: + # DOCS_ONLY mode - skip tests + - if: $DOCS_ONLY == "true" + when: never + # Force full pipeline if requested + - if: $FORCE_FULL_PIPELINE == "true" + when: on_success + # Always run on protected branches + - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" + when: on_success + # Always run on tags + - if: $CI_COMMIT_TAG + when: on_success + # Run if any non-doc files changed + - changes: + paths: + - backend/**/* + - endpoints/**/* + - docker/**/* + - scripts/**/* + - submodules/**/* + - tests/**/* + - Dockerfile + - .gitlab-ci.yml + when: on_success + # Skip otherwise (docs-only changes) + - when: never + +# Template: Skip test jobs if docs-only, with QUICK_TEST support +# In QUICK_TEST mode, tests still run (using cached data) +.skip_test_on_docs_only_with_quick_test: + rules: + # DOCS_ONLY mode - skip tests + - if: $DOCS_ONLY == "true" + when: never + # Force full pipeline if requested + - if: $FORCE_FULL_PIPELINE == "true" + when: on_success + # QUICK_TEST mode - run tests with cached data + - if: $QUICK_TEST == "true" + when: on_success + # Always run on protected branches + - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" + when: on_success + # Always run on tags + - if: $CI_COMMIT_TAG + when: on_success + # Run if any non-doc files changed + - changes: + paths: + - backend/**/* + - endpoints/**/* + - docker/**/* + - scripts/**/* + - submodules/**/* + - tests/**/* + - Dockerfile + - .gitlab-ci.yml + when: on_success + # Skip otherwise (docs-only changes) + - when: never + +# Template: Manual on feature branches, auto on integration branches +.manual_on_feature_branches: + rules: + # Force full pipeline overrides everything + - if: $FORCE_FULL_PIPELINE == "true" + when: on_success + # DOCS_ONLY mode - skip entirely + - if: $DOCS_ONLY == "true" + when: never + # QUICK_TEST mode - make jobs manual + - if: $QUICK_TEST == "true" + when: manual + allow_failure: true + # Always run automatically on protected branches + - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" + when: on_success + # Always run automatically on tags + - if: $CI_COMMIT_TAG + when: on_success + # Feature branches: manual by default + - when: manual + allow_failure: true diff --git a/submodules/haf b/submodules/haf index 48407d1a..b4225f9d 160000 --- a/submodules/haf +++ b/submodules/haf @@ -1 +1 @@ -Subproject commit 48407d1a5c08bef22d4cab7262e96e4202b8d99a +Subproject commit b4225f9d2591195b0e6aadf36bbef921d95f92b9 -- GitLab From 830a156179028a9f110fad765ee68effe6f7eb7d Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Mon, 29 Dec 2025 23:06:17 -0500 Subject: [PATCH 013/108] Switch to git fetch strategy with pre_get_sources hook - Use GIT_STRATEGY: fetch with GIT_DEPTH: 0 for efficient incremental fetches - Add pre_get_sources_script hook to clean corrupt git state from cancelled pipelines - Add separate GIT_CLONE_PATH to prevent clone-strategy jobs from 
erasing workspaces - Add .sqlfluffignore to exclude submodules from SQL linting --- .gitlab-ci.yml | 85 +++++++++++++++++++++++++++++++++++++++++++++++-- .sqlfluffignore | 2 ++ 2 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 .sqlfluffignore diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 951343cd..53fda82f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,10 +8,16 @@ stages: variables: # Git configuration - GIT_STRATEGY: clone + # Fetch strategy reuses workspace between jobs, reducing GitLab server load. + # Full clone (depth 0) enables efficient incremental fetches - shallow clones + # don't reduce server CPU and make fetch less effective. + GIT_STRATEGY: fetch + GIT_DEPTH: 0 + GIT_SUBMODULE_DEPTH: 0 GIT_SUBMODULE_STRATEGY: recursive - GIT_DEPTH: 1 - GIT_SUBMODULE_DEPTH: 1 + # Temporary: separate clone path prevents clone-strategy jobs from erasing + # fetch workspaces during transition. Remove once all projects use fetch. + GIT_CLONE_PATH: $CI_BUILDS_DIR/fetch/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID/$CI_PROJECT_PATH GIT_SUBMODULE_UPDATE_FLAGS: --jobs 4 # HAF configuration DATA_CACHE_HAF_PREFIX: "/cache/replay_data_haf" @@ -46,6 +52,79 @@ include: # Skip rules for docs-only changes and QUICK_TEST mode - local: '/scripts/ci-helpers/skip_rules.yml' +default: + hooks: + pre_get_sources_script: + # Clean corrupt git state left by cancelled pipelines (see GitLab #296638, #4600) + # Also handles directory-to-submodule transitions when switching branches + # Wrapped in subshell to avoid changing working directory for subsequent git operations + - | + ( + cd "${CI_PROJECT_DIR:-/builds}" 2>/dev/null || exit 0 + echo "pre_get_sources: checking $(pwd) for corrupt git state" + if [ -d ".git" ]; then + # Remove stale lock files that block git operations + find .git -name "*.lock" -delete 2>/dev/null || true + + # Check if main repo is corrupt - if so, remove .git to force fresh clone + if ! git rev-parse HEAD >/dev/null 2>&1; then + echo "pre_get_sources: main repository corrupt, forcing fresh clone" + rm -rf .git + else + # Main repo OK - check and clean corrupt submodules + # Check both the working dir and .git/modules/ since either can be corrupt + if [ -f ".gitmodules" ]; then + git config --file .gitmodules --get-regexp path 2>/dev/null | awk '{print $2}' | while read submod; do + needs_clean=false + [ -z "$submod" ] && continue + # Check if submodule working directory exists but is corrupt + if [ -d "$submod" ] && [ -f "$submod/.git" ]; then + if ! git -C "$submod" rev-parse HEAD >/dev/null 2>&1; then + needs_clean=true + fi + fi + # Check if .git/modules exists but is corrupt (even if working dir is gone) + if [ -d ".git/modules/$submod" ]; then + if ! 
git --git-dir=".git/modules/$submod" rev-parse HEAD >/dev/null 2>&1; then + echo "pre_get_sources: $submod corrupt (rev-parse failed)" + needs_clean=true + fi + fi + if [ "$needs_clean" = true ]; then + echo "pre_get_sources: cleaning corrupt submodule: $submod" + rm -rf "$submod" ".git/modules/$submod" + fi + done + fi + + # Handle directory-to-submodule transitions: fetch target ref's .gitmodules + # and remove any paths that exist as regular directories (not submodules) + if [ -n "$CI_COMMIT_REF_NAME" ]; then + echo "pre_get_sources: checking for directory-to-submodule transitions (ref: $CI_COMMIT_REF_NAME)" + # Fetch the target ref first (it may not exist locally yet) + git fetch origin "$CI_COMMIT_REF_NAME" --depth=1 2>&1 || true + target_gitmodules=$(git show "origin/$CI_COMMIT_REF_NAME:.gitmodules" 2>/dev/null) || true + if [ -n "$target_gitmodules" ]; then + echo "$target_gitmodules" | grep "path = " | sed 's/.*path = //' | while read submod; do + [ -z "$submod" ] && continue + # If path exists as a regular directory (not a submodule), remove it + if [ -d "$submod" ] && [ ! -f "$submod/.git" ]; then + echo "pre_get_sources: removing directory for submodule transition: $submod" + rm -rf "$submod" + fi + done + else + echo "pre_get_sources: no target gitmodules found" + fi + fi + + echo "pre_get_sources: existing repo OK" + fi + else + echo "pre_get_sources: no .git directory (fresh workspace)" + fi + ) + .lint_job: extends: .job-defaults stage: lint diff --git a/.sqlfluffignore b/.sqlfluffignore new file mode 100644 index 00000000..13dd8e45 --- /dev/null +++ b/.sqlfluffignore @@ -0,0 +1,2 @@ +# Exclude submodules directory (populated by other jobs using fetch strategy) +submodules/ -- GitLab From 85db1f1504c4fce44e6af26604a2340dcc9223a2 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Mon, 29 Dec 2025 23:21:03 -0500 Subject: [PATCH 014/108] Fix: Remove branch specification from HAF submodule --- .gitmodules | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index d6b8aaf7..8b3eeb2d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,7 +7,6 @@ [submodule "submodules/haf"] path = submodules/haf url = ../haf.git - branch = feature/nfs-cache-manager [submodule "submodules/reptracker"] path = submodules/reptracker url = ../reputation_tracker.git -- GitLab From 7243a9b75ba900344abaeb570ab6e06944ed62ba Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Mon, 29 Dec 2025 23:38:54 -0500 Subject: [PATCH 015/108] Fix: Use Python 3.12 for API client jobs (PyO3 incompatible with 3.14) --- .gitlab-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 53fda82f..bb1734a2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -369,6 +369,8 @@ generate-wax-spec: generate_python_api_client: extends: .project_develop_configuration_template + # Override image to Python 3.12 - api_client_generator doesn't support Python 3.14 yet + image: registry.gitlab.syncad.com/hive/common-ci-configuration/python:3.12.9-1 stage: build rules: - if: $DOCS_ONLY == "true" @@ -390,6 +392,8 @@ generate_python_api_client: build_python_api_client_wheel: extends: .build_wheel_template + # Use Python 3.12 for consistency with generate_python_api_client + image: registry.gitlab.syncad.com/hive/common-ci-configuration/python:3.12.9-1 stage: build rules: - if: $DOCS_ONLY == "true" @@ -619,6 +623,8 @@ sync: python_api_client_test: extends: .project_develop_configuration_template + # Use Python 3.12 for consistency with generate_python_api_client + image: 
registry.gitlab.syncad.com/hive/common-ci-configuration/python:3.12.9-1 stage: test rules: - if: $DOCS_ONLY == "true" -- GitLab From 7e5c730d9f5dc18225e8fd56fc76f04d985f6758 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Mon, 29 Dec 2025 23:52:23 -0500 Subject: [PATCH 016/108] Fix: Exclude submodules from shellcheck linting --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bb1734a2..43c3f1e1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -142,7 +142,7 @@ lint_bash_scripts: before_script: - apk add xmlstarlet script: - - find . -name .git -type d -prune -o -type f -name \*.sh -exec shellcheck -f checkstyle + - find . -name .git -type d -prune -o -name submodules -type d -prune -o -type f -name \*.sh -exec shellcheck -f checkstyle {} + | tee shellcheck-checkstyle-result.xml after_script: - xmlstarlet tr misc/checkstyle2junit.xslt shellcheck-checkstyle-result.xml > shellcheck-junit-result.xml -- GitLab From 4eeb544bfa54ac405f62621d1e8f648de52ce688 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:01:21 -0500 Subject: [PATCH 017/108] Fix submodule fetch errors for artifact-only jobs Add GIT_SUBMODULE_STRATEGY: none to jobs that only use artifacts from prior stages and don't need to fetch submodules. This avoids 'transport file not allowed' errors when nested submodules have local file path references in fetch strategy workspaces. Jobs updated: - generate_python_api_client - build_python_api_client_wheel - python_api_client_test - deploy_python_api_packages_to_gitlab - deploy-wax-spec-dev-package - deploy-wax-spec-production-public-npm --- .gitlab-ci.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 43c3f1e1..1823b023 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -378,6 +378,8 @@ generate_python_api_client: - when: on_success variables: PYPROJECT_DIR: "${CI_PROJECT_DIR}/scripts/python_api_package" + # Job uses artifacts, doesn't need submodules + GIT_SUBMODULE_STRATEGY: none needs: - job: extract-swagger-json artifacts: true @@ -404,6 +406,8 @@ build_python_api_client_wheel: artifacts: true variables: PYPROJECT_DIR: "${CI_PROJECT_DIR}/scripts/python_api_package" + # Job uses artifacts, doesn't need submodules + GIT_SUBMODULE_STRATEGY: none tags: - public-runner-docker @@ -635,6 +639,8 @@ python_api_client_test: artifacts: true variables: PYPROJECT_DIR: "${CI_PROJECT_DIR}/scripts/python_api_package" + # Job uses artifacts, doesn't need submodules + GIT_SUBMODULE_STRATEGY: none script: - pytest "${PYPROJECT_DIR}/tests" tags: @@ -835,6 +841,8 @@ deploy_python_api_packages_to_gitlab: extends: .deploy_wheel_to_gitlab_template variables: PYPROJECT_DIR: "${CI_PROJECT_DIR}/scripts/python_api_package" + # Job uses artifacts, doesn't need submodules + GIT_SUBMODULE_STRATEGY: none when: on_success tags: - public-runner-docker @@ -846,6 +854,8 @@ deploy-wax-spec-dev-package: SOURCE_DIR: "${PACKAGE_SOURCE_DIR}" PACKAGE_TGZ_PATH: "${BUILT_PACKAGE_PATH}" NPM_PACKAGE_SCOPE: "@hiveio" + # Job uses artifacts, doesn't need submodules + GIT_SUBMODULE_STRATEGY: none needs: - job: generate-wax-spec artifacts: true @@ -860,6 +870,8 @@ deploy-wax-spec-production-public-npm: NPM_PACKAGE_NAME: "wax-api-hafbe" SOURCE_DIR: "${PACKAGE_SOURCE_DIR}" PACKAGE_TGZ_PATH: "${BUILT_PACKAGE_PATH}" + # Job uses artifacts, doesn't need submodules + GIT_SUBMODULE_STRATEGY: none needs: - job: generate-wax-spec artifacts: true -- GitLab From 
4753a18aa5c9b25b0524bd95ac17620fec61aa55 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:02:26 -0500 Subject: [PATCH 018/108] Add GIT_SUBMODULE_STRATEGY: none to detect_changes job --- scripts/ci-helpers/skip_rules.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/ci-helpers/skip_rules.yml b/scripts/ci-helpers/skip_rules.yml index b63c5a80..e488bc49 100644 --- a/scripts/ci-helpers/skip_rules.yml +++ b/scripts/ci-helpers/skip_rules.yml @@ -23,6 +23,9 @@ variables: stage: lint image: alpine:latest needs: [] + variables: + # Job only checks git diff, doesn't need submodules + GIT_SUBMODULE_STRATEGY: none before_script: - apk add --no-cache git script: -- GitLab From 524972bfc76c2caf21a9472b82d3773da06207b7 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:04:59 -0500 Subject: [PATCH 019/108] Fix nested submodule fetch errors - Change GIT_SUBMODULE_STRATEGY from 'recursive' to 'normal' to avoid file:// URL errors for deeply nested submodules (e.g. btracker/haf/hive) - Add GIT_SUBMODULE_STRATEGY: none to extract-swagger-json and generate-wax-spec jobs since they only need main repo files and artifacts --- .gitlab-ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1823b023..c84f96e3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,7 +14,8 @@ variables: GIT_STRATEGY: fetch GIT_DEPTH: 0 GIT_SUBMODULE_DEPTH: 0 - GIT_SUBMODULE_STRATEGY: recursive + # Use 'normal' not 'recursive' - recursive causes file:// URL errors for nested submodules + GIT_SUBMODULE_STRATEGY: normal # Temporary: separate clone path prevents clone-strategy jobs from erasing # fetch workspaces during transition. Remove once all projects use fetch. GIT_CLONE_PATH: $CI_BUILDS_DIR/fetch/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID/$CI_PROJECT_PATH @@ -346,6 +347,8 @@ extract-swagger-json: - when: on_success variables: INPUT_SQL_SWAGGER_FILE: "${CI_PROJECT_DIR}/endpoints/endpoint_schema.sql" + # Job only needs main repo files, not submodules + GIT_SUBMODULE_STRATEGY: none tags: - public-runner-docker @@ -361,6 +364,8 @@ generate-wax-spec: API_TYPE: "rest" NPM_PACKAGE_SCOPE: "@hiveio" NPM_PACKAGE_NAME: "wax-api-hafbe" + # Job uses artifacts, doesn't need submodules + GIT_SUBMODULE_STRATEGY: none needs: - job: extract-swagger-json artifacts: true -- GitLab From 690dca1c832419a016fbb537aae1686dcbba4eeb Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:08:02 -0500 Subject: [PATCH 020/108] Fix wax-spec deploy jobs to use CI_PROJECT_DIR for artifact paths The PACKAGE_SOURCE_DIR dotenv variable contains runner-specific paths (GIT_CLONE_PATH) that don't work when the deploy job runs on a different runner. Use CI_PROJECT_DIR instead which is consistent across runners. 
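A minimal sketch of the failure mode this patch fixes (the runner paths below are hypothetical):

    # Producing job (runner A) records its workspace-specific absolute path in the dotenv:
    #   PACKAGE_SOURCE_DIR=/builds/fetch/tokenA/2/hive/haf_block_explorer/build/generated
    # Consuming job (runner B) restores the artifact under its own workspace:
    #   CI_PROJECT_DIR=/builds/fetch/tokenB/0/hive/haf_block_explorer
    [ -d "$PACKAGE_SOURCE_DIR" ] || echo "dotenv path not valid on this runner: $PACKAGE_SOURCE_DIR"
    # Referencing the artifact relative to CI_PROJECT_DIR resolves on any runner:
    ls "${CI_PROJECT_DIR}/build/generated"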
--- .gitlab-ci.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c84f96e3..df44565c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -856,8 +856,9 @@ deploy-wax-spec-dev-package:
 extends: .npm_deploy_package_template
 stage: publish
 variables:
- SOURCE_DIR: "${PACKAGE_SOURCE_DIR}"
- PACKAGE_TGZ_PATH: "${BUILT_PACKAGE_PATH}"
+ # Use CI_PROJECT_DIR instead of PACKAGE_SOURCE_DIR which has runner-specific paths
+ SOURCE_DIR: "${CI_PROJECT_DIR}/build/generated"
+ PACKAGE_TGZ_PATH: "${CI_PROJECT_DIR}/build/wax-api-hafbe.tgz"
 NPM_PACKAGE_SCOPE: "@hiveio"
 # Job uses artifacts, doesn't need submodules
 GIT_SUBMODULE_STRATEGY: none
@@ -873,8 +874,9 @@ deploy-wax-spec-production-public-npm:
 variables:
 NPM_PUBLISH_TOKEN: "$INTERNAL_HIDDEN_PUBLISH_TOKEN"
 NPM_PACKAGE_NAME: "wax-api-hafbe"
- SOURCE_DIR: "${PACKAGE_SOURCE_DIR}"
- PACKAGE_TGZ_PATH: "${BUILT_PACKAGE_PATH}"
+ # Use CI_PROJECT_DIR instead of PACKAGE_SOURCE_DIR which has runner-specific paths
+ SOURCE_DIR: "${CI_PROJECT_DIR}/build/generated"
+ PACKAGE_TGZ_PATH: "${CI_PROJECT_DIR}/build/wax-api-hafbe.tgz"
 # Job uses artifacts, doesn't need submodules
 GIT_SUBMODULE_STRATEGY: none
 needs:
-- GitLab

From 2b36bb1c0615b7a84c851042bd2981b0ad5dbf9d Mon Sep 17 00:00:00 2001
From: Dan Notestein
Date: Tue, 30 Dec 2025 00:11:03 -0500
Subject: [PATCH 021/108] Fix wax-spec deploy to find tgz dynamically

The package filename includes a version and timestamp, so it can't be hardcoded. Use before_script to find the actual tgz file in the build/ directory.

--- .gitlab-ci.yml | 10 ++++++-- 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index df44565c..0d510b52 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -858,10 +858,13 @@ deploy-wax-spec-dev-package:
 variables:
 # Use CI_PROJECT_DIR instead of PACKAGE_SOURCE_DIR which has runner-specific paths
 SOURCE_DIR: "${CI_PROJECT_DIR}/build/generated"
- PACKAGE_TGZ_PATH: "${CI_PROJECT_DIR}/build/wax-api-hafbe.tgz"
 NPM_PACKAGE_SCOPE: "@hiveio"
 # Job uses artifacts, doesn't need submodules
 GIT_SUBMODULE_STRATEGY: none
 before_script:
 # Fix PACKAGE_TGZ_PATH - dotenv has runner-specific path, need to find actual file
 - export PACKAGE_TGZ_PATH=$(ls ${CI_PROJECT_DIR}/build/*.tgz 2>/dev/null | head -1)
 - echo "Using PACKAGE_TGZ_PATH=$PACKAGE_TGZ_PATH"
 needs:
 - job: generate-wax-spec
 artifacts: true
@@ -876,9 +879,12 @@ deploy-wax-spec-production-public-npm:
 variables:
 NPM_PUBLISH_TOKEN: "$INTERNAL_HIDDEN_PUBLISH_TOKEN"
 NPM_PACKAGE_NAME: "wax-api-hafbe"
 # Use CI_PROJECT_DIR instead of PACKAGE_SOURCE_DIR which has runner-specific paths
 SOURCE_DIR: "${CI_PROJECT_DIR}/build/generated"
- PACKAGE_TGZ_PATH: "${CI_PROJECT_DIR}/build/wax-api-hafbe.tgz"
 # Job uses artifacts, doesn't need submodules
 GIT_SUBMODULE_STRATEGY: none
 before_script:
 # Fix PACKAGE_TGZ_PATH - dotenv has runner-specific path, need to find actual file
 - export PACKAGE_TGZ_PATH=$(ls ${CI_PROJECT_DIR}/build/*.tgz 2>/dev/null | head -1)
 - echo "Using PACKAGE_TGZ_PATH=$PACKAGE_TGZ_PATH"
 needs:
 - job: generate-wax-spec
 artifacts: true
-- GitLab

From 6b0553bf4330f6fc9d146af381e182b0e7e46fca Mon Sep 17 00:00:00 2001
From: Dan Notestein
Date: Tue, 30 Dec 2025 00:13:26 -0500
Subject: [PATCH 022/108] Fix wax-spec deploy before_script to include nvm setup

When overriding before_script, the job must replicate the template's setup that initializes the nvm and node/pnpm environment.
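For context: in GitLab CI, a job-level before_script replaces the template's list outright rather than appending to it, so the inherited setup has to be repeated verbatim. A sketch using the same lines the diff below adds (NVM_DIR and NODEJS_VERSION are assumed to be provided by the template's image):

    # Template setup an overriding job must repeat before its own steps:
    git config --global --add safe.directory '*'
    . "${NVM_DIR}/nvm.sh"
    nvm use "${NODEJS_VERSION}"
    # ...then the job-specific steps, e.g. the PACKAGE_TGZ_PATH lookup.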
--- .gitlab-ci.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0d510b52..db900c0a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -862,6 +862,10 @@ deploy-wax-spec-dev-package: # Job uses artifacts, doesn't need submodules GIT_SUBMODULE_STRATEGY: none before_script: + # Replicate template's before_script setup + - git config --global --add safe.directory '*' + - . "${NVM_DIR}/nvm.sh" + - nvm use "${NODEJS_VERSION}" # Fix PACKAGE_TGZ_PATH - dotenv has runner-specific path, need to find actual file - export PACKAGE_TGZ_PATH=$(ls ${CI_PROJECT_DIR}/build/*.tgz 2>/dev/null | head -1) - echo "Using PACKAGE_TGZ_PATH=$PACKAGE_TGZ_PATH" @@ -882,6 +886,10 @@ deploy-wax-spec-production-public-npm: # Job uses artifacts, doesn't need submodules GIT_SUBMODULE_STRATEGY: none before_script: + # Replicate template's before_script setup + - git config --global --add safe.directory '*' + - . "${NVM_DIR}/nvm.sh" + - nvm use "${NODEJS_VERSION}" # Fix PACKAGE_TGZ_PATH - dotenv has runner-specific path, need to find actual file - export PACKAGE_TGZ_PATH=$(ls ${CI_PROJECT_DIR}/build/*.tgz 2>/dev/null | head -1) - echo "Using PACKAGE_TGZ_PATH=$PACKAGE_TGZ_PATH" -- GitLab From 0a55e448138b8341172cc987223d23a5c47b3bb9 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:16:12 -0500 Subject: [PATCH 023/108] Fix validate_haf_commit to avoid nested submodule issues Use GIT_SUBMODULE_STRATEGY: none and manually init only the haf submodule without recursion, avoiding file:// URL errors from stale nested submodule configurations. --- .gitlab-ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index db900c0a..2e1f752a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -198,11 +198,16 @@ quick_test_setup: validate_haf_commit: stage: build image: alpine:latest + variables: + # Avoid nested submodule issues - manually init just haf submodule + GIT_SUBMODULE_STRATEGY: none script: - | set -e apk add --no-cache git - SUBMODULE_COMMIT=$(cat .git/modules/submodules/haf/HEAD 2>/dev/null || git -C submodules/haf rev-parse HEAD) + # Manually init just the haf submodule (no recursion) + git submodule update --init submodules/haf + SUBMODULE_COMMIT=$(git -C submodules/haf rev-parse HEAD) INCLUDE_REF=$(grep -A2 "project:.*hive/haf" .gitlab-ci.yml | grep "ref:" | head -1 | sed 's/.*ref: *\([a-f0-9]*\).*/\1/' || true) echo "HAF_COMMIT variable: $HAF_COMMIT" echo "HAF submodule HEAD: $SUBMODULE_COMMIT" -- GitLab From 9e188b4b77429705bb7fc950863722efb9c65237 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:17:27 -0500 Subject: [PATCH 024/108] Clean stale git modules before submodule init in validate_haf_commit --- .gitlab-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2e1f752a..88afacd6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -205,6 +205,9 @@ validate_haf_commit: - | set -e apk add --no-cache git + # Clean stale submodule state that may have file:// URLs + rm -rf .git/modules/submodules + git submodule deinit -f --all 2>/dev/null || true # Manually init just the haf submodule (no recursion) git submodule update --init submodules/haf SUBMODULE_COMMIT=$(git -C submodules/haf rev-parse HEAD) -- GitLab From 4fce3910c660676be1dec51b984b3e825c5dd0c7 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:26:53 -0500 Subject: [PATCH 025/108] Remove obsolete hived and hived-for-tests runner tags --- .gitlab-ci.yml | 6 
------ 1 file changed, 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 88afacd6..51731b04 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -255,8 +255,6 @@ prepare_haf_image: - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf tags: - public-runner-docker - - hived-for-tests - - hived prepare_haf_data: extends: @@ -317,8 +315,6 @@ prepare_haf_data: echo -e "\e[0Ksection_end:$(date +%s):build\r\e[0K" tags: - public-runner-docker - - hived-for-tests - - hived docker-ci-runner-build: extends: .docker-base-build-template @@ -840,8 +836,6 @@ build_and_publish_image: fi tags: - public-runner-docker - - hived-for-tests - - hived deploy_python_api_packages_to_gitlab: stage: publish -- GitLab From b4273bad6437dd8ae14cbe4cc8f8e0bed70bd33e Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:29:53 -0500 Subject: [PATCH 026/108] Fix CI runner Dockerfile and prepare_haf_image job - Dockerfile: Handle existing hived user in base image - prepare_haf_image: Init hive submodule for HAF script symlinks --- .gitlab-ci.yml | 4 ++++ docker/ci/Dockerfile | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 51731b04..7abc4a96 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -253,6 +253,10 @@ prepare_haf_image: REGISTRY_PASS: "$HAF_DEPLOY_TOKEN" before_script: - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf + # HAF scripts are symlinks to hive submodule - init it without recursion + - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive + - rm -rf $CI_PROJECT_DIR/submodules/haf/.git/modules 2>/dev/null || true + - git -C $CI_PROJECT_DIR/submodules/haf submodule update --init hive tags: - public-runner-docker diff --git a/docker/ci/Dockerfile b/docker/ci/Dockerfile index 05ee7ce2..84acd82c 100644 --- a/docker/ci/Dockerfile +++ b/docker/ci/Dockerfile @@ -11,8 +11,8 @@ RUN <<-EOF sudo EOF -# Create hived user for compatibility with existing scripts -RUN adduser -D -s /bin/bash hived && \ - echo "hived ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers +# Create hived user for compatibility with existing scripts (if not exists) +RUN id hived 2>/dev/null || adduser -D -s /bin/bash hived && \ + grep -q "^hived " /etc/sudoers || echo "hived ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers USER hived \ No newline at end of file -- GitLab From da1fbed242342fc043ca2825fdb18839b3f2f143 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:31:18 -0500 Subject: [PATCH 027/108] Add GIT_SUBMODULE_STRATEGY: none to docker-ci-runner-build --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7abc4a96..9f0d6641 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -331,6 +331,8 @@ docker-ci-runner-build: BASE_TAG: "docker-26.1.4-1" NAME: "ci-runner" TARGET: "ci-runner-ci" + # Only builds Dockerfile, doesn't need submodules + GIT_SUBMODULE_STRATEGY: none docker-setup-docker-image-build: extends: .docker-base-build-template -- GitLab From 2db48766f04c1c741b8aa7115cf8bf037ddd0d1b Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:46:40 -0500 Subject: [PATCH 028/108] Fix prepare_haf_data to avoid nested submodule issues --- .gitlab-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9f0d6641..f9dce63b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -276,6 +276,12 @@ prepare_haf_data: SUBMODULE_DIR: "$CI_PROJECT_DIR/submodules/haf" 
BLOCK_LOG_SOURCE_DIR: $BLOCK_LOG_SOURCE_DIR_5M CONFIG_INI_SOURCE: "$CI_PROJECT_DIR/submodules/haf/docker/config_5M.ini" + # Avoid nested submodule issues + GIT_SUBMODULE_STRATEGY: none + before_script: + # Manually init just the haf submodule without nested submodules + - rm -rf .git/modules/submodules 2>/dev/null || true + - git submodule update --init submodules/haf tags: - data-cache-storage - fast -- GitLab From 9378742c6e4bcdab0ef607cd782707c5fd06c90f Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 00:58:09 -0500 Subject: [PATCH 029/108] Fix git modules cleanup for prepare_haf_image --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f9dce63b..f46dc46b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -253,9 +253,11 @@ prepare_haf_image: REGISTRY_PASS: "$HAF_DEPLOY_TOKEN" before_script: - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - # HAF scripts are symlinks to hive submodule - init it without recursion - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive + # Clean stale git modules state (may have wrong permissions) + - rm -rf .git/modules/submodules/haf/modules 2>/dev/null || true - rm -rf $CI_PROJECT_DIR/submodules/haf/.git/modules 2>/dev/null || true + # HAF scripts are symlinks to hive submodule - init it without recursion - git -C $CI_PROJECT_DIR/submodules/haf submodule update --init hive tags: - public-runner-docker -- GitLab From aba4df70cccc8a2cbfbce59115debc413988cf5e Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:00:51 -0500 Subject: [PATCH 030/108] Fix: Remove entire .git/modules/submodules/haf dir to fix permission issues --- .gitlab-ci.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f46dc46b..7125c9c9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -254,9 +254,12 @@ prepare_haf_image: before_script: - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive - # Clean stale git modules state (may have wrong permissions) - - rm -rf .git/modules/submodules/haf/modules 2>/dev/null || true - - rm -rf $CI_PROJECT_DIR/submodules/haf/.git/modules 2>/dev/null || true + # Clean stale git modules state (may have wrong permissions from previous runs) + - sudo rm -rf .git/modules/submodules/haf 2>/dev/null || rm -rf .git/modules/submodules/haf 2>/dev/null || true + - sudo rm -rf $CI_PROJECT_DIR/submodules/haf/.git/modules 2>/dev/null || rm -rf $CI_PROJECT_DIR/submodules/haf/.git/modules 2>/dev/null || true + # Deinit and reinit the haf submodule to get clean state + - git submodule deinit -f submodules/haf 2>/dev/null || true + - git submodule update --init submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion - git -C $CI_PROJECT_DIR/submodules/haf submodule update --init hive tags: -- GitLab From a6622a0cebb9eb6a740cf04ffcc729e4854d23da Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:03:16 -0500 Subject: [PATCH 031/108] Fix: Add main project dir to safe.directory to fix dubious ownership error --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7125c9c9..d06b6c46 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -252,6 +252,7 @@ prepare_haf_image: REGISTRY_USER: "$HAF_DEPLOY_USERNAME" REGISTRY_PASS: "$HAF_DEPLOY_TOKEN" 
before_script: + - git config --global --add safe.directory $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive # Clean stale git modules state (may have wrong permissions from previous runs) -- GitLab From c51263a65b80e0a9141d4c680ed75ab5760b6e31 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:06:03 -0500 Subject: [PATCH 032/108] Fix: Add safe.directory to prepare_haf_data for dubious ownership error --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d06b6c46..53aeafea 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -285,6 +285,8 @@ prepare_haf_data: # Avoid nested submodule issues GIT_SUBMODULE_STRATEGY: none before_script: + - git config --global --add safe.directory $CI_PROJECT_DIR + - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf # Manually init just the haf submodule without nested submodules - rm -rf .git/modules/submodules 2>/dev/null || true - git submodule update --init submodules/haf -- GitLab From a67a5f4f50066ee7b41cd70d90a63a55978dbbb4 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:07:58 -0500 Subject: [PATCH 033/108] Fix: Init hive submodule inside HAF for prepare_haf_data job --- .gitlab-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 53aeafea..4661b0cf 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -287,9 +287,12 @@ prepare_haf_data: before_script: - git config --global --add safe.directory $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf + - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive # Manually init just the haf submodule without nested submodules - rm -rf .git/modules/submodules 2>/dev/null || true - git submodule update --init submodules/haf + # HAF scripts are symlinks to hive submodule - init it without recursion + - git -C $CI_PROJECT_DIR/submodules/haf submodule update --init hive tags: - data-cache-storage - fast -- GitLab From d2a1e0544648c18086433e606be773deeb0573d5 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:17:06 -0500 Subject: [PATCH 034/108] Fix: Add GIT_SUBMODULE_STRATEGY none and manual submodule init to sync job --- .gitlab-ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4661b0cf..456edca4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -472,6 +472,8 @@ sync: POSTGRES_ACCESS: postgresql://haf_admin@docker:5432/haf_block_log COMPOSE_OPTIONS_STRING: --env-file ci.env --file docker-compose.yml --file overrides/ci.yml --ansi never + # Avoid nested submodule issues + GIT_SUBMODULE_STRATEGY: none timeout: 1 hours before_script: - | @@ -481,6 +483,11 @@ sync: echo -e "\e[0Ksection_start:$(date +%s):git[collapsed=true]\r\e[0KConfiguring Git..." 
git config --global --add safe.directory "$CI_PROJECT_DIR" git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf" + git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf/hive" + # Manually init submodules without nested recursion + rm -rf .git/modules/submodules 2>/dev/null || true + git submodule update --init submodules/haf + git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive echo -e "\e[0Ksection_end:$(date +%s):git\r\e[0K" - | # Ensure HAF replay data is available locally (fetch from NFS if needed) -- GitLab From 9ab4e57d2d4a14f2817a8d673432e6eae6c91485 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:20:02 -0500 Subject: [PATCH 035/108] Fix: Remove stale submodule directories before reinitializing --- .gitlab-ci.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 456edca4..7979f7df 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -288,9 +288,11 @@ prepare_haf_data: - git config --global --add safe.directory $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive - # Manually init just the haf submodule without nested submodules + # Clean up any stale submodule state - rm -rf .git/modules/submodules 2>/dev/null || true - - git submodule update --init submodules/haf + - rm -rf submodules/haf 2>/dev/null || true + # Manually init just the haf submodule without nested submodules + - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion - git -C $CI_PROJECT_DIR/submodules/haf submodule update --init hive tags: @@ -484,9 +486,11 @@ sync: git config --global --add safe.directory "$CI_PROJECT_DIR" git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf" git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf/hive" - # Manually init submodules without nested recursion + # Clean up any stale submodule state rm -rf .git/modules/submodules 2>/dev/null || true - git submodule update --init submodules/haf + rm -rf submodules/haf 2>/dev/null || true + # Manually init submodules without nested recursion + git submodule update --init --force submodules/haf git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive echo -e "\e[0Ksection_end:$(date +%s):git\r\e[0K" - | -- GitLab From aa537b9c94de8f328442615c9dd2bbd982898df3 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:22:21 -0500 Subject: [PATCH 036/108] Fix: Add GIT_SUBMODULE_STRATEGY none to prepare_haf_image job --- .gitlab-ci.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7979f7df..02f52769 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -251,16 +251,17 @@ prepare_haf_image: SUBMODULE_DIR: "$CI_PROJECT_DIR/submodules/haf" REGISTRY_USER: "$HAF_DEPLOY_USERNAME" REGISTRY_PASS: "$HAF_DEPLOY_TOKEN" + # Avoid nested submodule issues + GIT_SUBMODULE_STRATEGY: none before_script: - git config --global --add safe.directory $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive - # Clean stale git modules state (may have wrong permissions from previous runs) - - sudo rm -rf .git/modules/submodules/haf 2>/dev/null || rm -rf .git/modules/submodules/haf 2>/dev/null || true - - 
sudo rm -rf $CI_PROJECT_DIR/submodules/haf/.git/modules 2>/dev/null || rm -rf $CI_PROJECT_DIR/submodules/haf/.git/modules 2>/dev/null || true - # Deinit and reinit the haf submodule to get clean state - - git submodule deinit -f submodules/haf 2>/dev/null || true - - git submodule update --init submodules/haf + # Clean up any stale submodule state + - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true + - rm -rf submodules/haf 2>/dev/null || true + # Manually init submodules without nested recursion + - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion - git -C $CI_PROJECT_DIR/submodules/haf submodule update --init hive tags: -- GitLab From 88a1feaf17a23cd783ca78fec41aa04dc555557c Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:31:58 -0500 Subject: [PATCH 037/108] Fix: Download cache-manager.sh from common-ci-configuration in sync job --- .gitlab-ci.yml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 02f52769..5aca0b8c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -501,16 +501,15 @@ sync: echo "Local HAF cache found at ${LOCAL_HAF_CACHE}" else echo "Local HAF cache not found, checking NFS..." - CACHE_MANAGER="${CI_PROJECT_DIR}/submodules/haf/scripts/ci-helpers/cache-manager.sh" - if [[ -x "$CACHE_MANAGER" ]]; then - if CACHE_HANDLING=haf "$CACHE_MANAGER" get haf "${HAF_COMMIT}" "${LOCAL_HAF_CACHE}"; then - echo "Fetched HAF replay data from NFS cache" - else - echo "ERROR: Failed to fetch HAF replay data from NFS cache" - exit 1 - fi + CACHE_MANAGER="/tmp/cache-manager.sh" + if [[ ! -x "$CACHE_MANAGER" ]]; then + curl -fsSL "https://gitlab.syncad.com/hive/common-ci-configuration/-/raw/develop/scripts/cache-manager.sh" -o "$CACHE_MANAGER" + chmod +x "$CACHE_MANAGER" + fi + if CACHE_HANDLING=haf "$CACHE_MANAGER" get haf "${HAF_COMMIT}" "${LOCAL_HAF_CACHE}"; then + echo "Fetched HAF replay data from NFS cache" else - echo "ERROR: cache-manager.sh not found and local cache missing" + echo "ERROR: Failed to fetch HAF replay data from NFS cache" exit 1 fi fi -- GitLab From e2dd036e29aa8615cf89fa5b09bf8fc685250448 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:35:11 -0500 Subject: [PATCH 038/108] Fix: Use sudo to remove .git/modules in prepare_haf_data and sync jobs --- .gitlab-ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5aca0b8c..c5b3b5d0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -289,8 +289,8 @@ prepare_haf_data: - git config --global --add safe.directory $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive - # Clean up any stale submodule state - - rm -rf .git/modules/submodules 2>/dev/null || true + # Clean up any stale submodule state (may have wrong permissions from previous runs) + - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - rm -rf submodules/haf 2>/dev/null || true # Manually init just the haf submodule without nested submodules - git submodule update --init --force submodules/haf @@ -487,8 +487,8 @@ sync: git config --global --add safe.directory "$CI_PROJECT_DIR" git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf" git config --global --add safe.directory 
"$CI_PROJECT_DIR/submodules/haf/hive" - # Clean up any stale submodule state - rm -rf .git/modules/submodules 2>/dev/null || true + # Clean up any stale submodule state (may have wrong permissions from previous runs) + sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true rm -rf submodules/haf 2>/dev/null || true # Manually init submodules without nested recursion git submodule update --init --force submodules/haf -- GitLab From ed2d37cb76c341d2081f10e48e2ec659fecc4d75 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:48:55 -0500 Subject: [PATCH 039/108] Fix: Init all submodules (not just haf) in sync job for docker-compose volumes --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c5b3b5d0..24153418 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -489,9 +489,9 @@ sync: git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf/hive" # Clean up any stale submodule state (may have wrong permissions from previous runs) sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - rm -rf submodules/haf 2>/dev/null || true - # Manually init submodules without nested recursion - git submodule update --init --force submodules/haf + rm -rf submodules 2>/dev/null || true + # Manually init all submodules - HAF with its nested hive, plus btracker, hafah, reptracker + git submodule update --init --force git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive echo -e "\e[0Ksection_end:$(date +%s):git\r\e[0K" - | -- GitLab From c2418dbbec904f964f294ed172806de8754e5a48 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:53:38 -0500 Subject: [PATCH 040/108] Fix: Remove entire submodules dir and checkout before reinit to fix handle error --- .gitlab-ci.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 24153418..bae7d1c7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -257,9 +257,10 @@ prepare_haf_image: - git config --global --add safe.directory $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive - # Clean up any stale submodule state + # Clean up any stale submodule state - remove everything and reinit cleanly - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - - rm -rf submodules/haf 2>/dev/null || true + - rm -rf submodules 2>/dev/null || true + - git checkout -- submodules 2>/dev/null || true # Manually init submodules without nested recursion - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -289,9 +290,10 @@ prepare_haf_data: - git config --global --add safe.directory $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive - # Clean up any stale submodule state (may have wrong permissions from previous runs) + # Clean up any stale submodule state - remove everything and reinit cleanly - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - - rm -rf submodules/haf 2>/dev/null || true + - rm -rf submodules 2>/dev/null || true + - git checkout -- submodules 2>/dev/null || true # Manually 
init just the haf submodule without nested submodules - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -487,9 +489,10 @@ sync: git config --global --add safe.directory "$CI_PROJECT_DIR" git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf" git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf/hive" - # Clean up any stale submodule state (may have wrong permissions from previous runs) + # Clean up any stale submodule state - remove everything and reinit cleanly sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true rm -rf submodules 2>/dev/null || true + git checkout -- submodules 2>/dev/null || true # Manually init all submodules - HAF with its nested hive, plus btracker, hafah, reptracker git submodule update --init --force git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive -- GitLab From 4a111626f72a232a7643cda956907369df5b82ce Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 01:58:03 -0500 Subject: [PATCH 041/108] Fix: Use git submodule deinit before removing .git/modules to fix handle error --- .gitlab-ci.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bae7d1c7..d1495bb4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -257,10 +257,10 @@ prepare_haf_image: - git config --global --add safe.directory $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive - # Clean up any stale submodule state - remove everything and reinit cleanly + # Clean up any stale submodule state - deinit first, then remove + - git submodule deinit -f --all 2>/dev/null || true - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - rm -rf submodules 2>/dev/null || true - - git checkout -- submodules 2>/dev/null || true # Manually init submodules without nested recursion - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -290,10 +290,10 @@ prepare_haf_data: - git config --global --add safe.directory $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive - # Clean up any stale submodule state - remove everything and reinit cleanly + # Clean up any stale submodule state - deinit first, then remove + - git submodule deinit -f --all 2>/dev/null || true - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - rm -rf submodules 2>/dev/null || true - - git checkout -- submodules 2>/dev/null || true # Manually init just the haf submodule without nested submodules - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -489,10 +489,10 @@ sync: git config --global --add safe.directory "$CI_PROJECT_DIR" git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf" git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf/hive" - # Clean up any stale submodule state - remove everything and reinit cleanly + # Clean up any stale submodule state - deinit first, then remove + git submodule deinit -f --all 2>/dev/null || true sudo rm -rf .git/modules/submodules 
2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true rm -rf submodules 2>/dev/null || true - git checkout -- submodules 2>/dev/null || true # Manually init all submodules - HAF with its nested hive, plus btracker, hafah, reptracker git submodule update --init --force git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive -- GitLab From 52900716c85aa0f10e1288575617c67f28912995 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 02:01:51 -0500 Subject: [PATCH 042/108] Fix: Use sudo rm for submodules directory to handle permission issues --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d1495bb4..1ae671c2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -260,7 +260,7 @@ prepare_haf_image: # Clean up any stale submodule state - deinit first, then remove - git submodule deinit -f --all 2>/dev/null || true - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - - rm -rf submodules 2>/dev/null || true + - sudo rm -rf submodules 2>/dev/null || rm -rf submodules 2>/dev/null || true # Manually init submodules without nested recursion - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -293,7 +293,7 @@ prepare_haf_data: # Clean up any stale submodule state - deinit first, then remove - git submodule deinit -f --all 2>/dev/null || true - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - - rm -rf submodules 2>/dev/null || true + - sudo rm -rf submodules 2>/dev/null || rm -rf submodules 2>/dev/null || true # Manually init just the haf submodule without nested submodules - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -492,7 +492,7 @@ sync: # Clean up any stale submodule state - deinit first, then remove git submodule deinit -f --all 2>/dev/null || true sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - rm -rf submodules 2>/dev/null || true + sudo rm -rf submodules 2>/dev/null || rm -rf submodules 2>/dev/null || true # Manually init all submodules - HAF with its nested hive, plus btracker, hafah, reptracker git submodule update --init --force git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive -- GitLab From ef15e6d47d8a2a0485acb6631a76e239eb14887e Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 02:21:00 -0500 Subject: [PATCH 043/108] Fix: Init nested haf submodules in hafah, btracker, reptracker for sync job The app-setup container in docker-compose mounts these submodules, and their scripts expect the nested haf submodule to be present for HAF helper functions like create_haf_app_role.sh. 
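The three per-repository init lines added below could equally be written as a loop; a sketch, assuming each app submodule names its nested HAF checkout haf, as the message states:

    # Each HAF app mounts its own nested haf submodule for helper scripts
    # such as create_haf_app_role.sh:
    for app in hafah btracker reptracker; do
      git -C "$CI_PROJECT_DIR/submodules/$app" submodule update --init haf
    done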
--- .gitlab-ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1ae671c2..32ce1bc2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -496,6 +496,10 @@ sync: # Manually init all submodules - HAF with its nested hive, plus btracker, hafah, reptracker git submodule update --init --force git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive + # Init nested haf submodule in hafah, btracker, reptracker (they all depend on haf scripts) + git -C "$CI_PROJECT_DIR/submodules/hafah" submodule update --init haf + git -C "$CI_PROJECT_DIR/submodules/btracker" submodule update --init haf + git -C "$CI_PROJECT_DIR/submodules/reptracker" submodule update --init haf echo -e "\e[0Ksection_end:$(date +%s):git\r\e[0K" - | # Ensure HAF replay data is available locally (fetch from NFS if needed) -- GitLab From 7e7345d7f6efe0c46f52f61361c7594e2c6bd213 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 02:39:41 -0500 Subject: [PATCH 044/108] Fix: Use hive/hive ci-base-image per HAF's current configuration HAF ci_image_tag_vars.yml specifies registry.gitlab.syncad.com/hive/hive/ ci-base-image:ubuntu24.04-py3.14-1 (not hive/haf). Updated BUILDER_IMAGE_PATH to match. --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 32ce1bc2..bf0dc3fc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -29,9 +29,9 @@ variables: BLOCK_LOG_SOURCE_DIR_5M: /blockchain/block_log_5m FF_NETWORK_PER_BUILD: 1 PYTEST_NUMBER_OF_PROCESSES: 8 - # uses registry.gitlab.syncad.com/hive/haf/ci-base-image:ubuntu24.04-1 + # uses registry.gitlab.syncad.com/hive/hive/ci-base-image:ubuntu24.04-py3.14-1 (via $TEST_HAF_IMAGE_TAG from HAF) BUILDER_IMAGE_TAG: "$TEST_HAF_IMAGE_TAG" - BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/haf/ci-base-image${BUILDER_IMAGE_TAG}" + BUILDER_IMAGE_PATH: "registry.gitlab.syncad.com/hive/hive/ci-base-image${BUILDER_IMAGE_TAG}" # HAF submodule commit - must match the 'ref:' in the include section below # This is needed for service containers which can't access dotenv artifacts HAF_COMMIT: "b4225f9d2591195b0e6aadf36bbef921d95f92b9" -- GitLab From fff6702f5603134c8e1524ff008ff3265083e1f8 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 02:41:44 -0500 Subject: [PATCH 045/108] Fix: Recreate submodule directories after sudo cleanup After removing .git/modules/submodules with sudo, git can't create new directories inside it due to permission issues. Now we explicitly mkdir the parent directories with proper user permissions. 
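A sketch of the permission trap being fixed (illustrative shell, not the exact job script):

    sudo rm -rf .git/modules/submodules submodules
    mkdir -p .git/modules/submodules   # can fail: Permission denied after a root-owned cleanup
    # Recreate with sudo, then hand ownership back so plain git commands succeed:
    sudo mkdir -p .git/modules/submodules submodules
    sudo chown -R "$(id -u):$(id -g)" .git/modules/submodules submodules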
--- .gitlab-ci.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bf0dc3fc..2d6f69b9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -259,8 +259,9 @@ prepare_haf_image: - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive # Clean up any stale submodule state - deinit first, then remove - git submodule deinit -f --all 2>/dev/null || true - - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - - sudo rm -rf submodules 2>/dev/null || rm -rf submodules 2>/dev/null || true + - sudo rm -rf .git/modules/submodules submodules 2>/dev/null || rm -rf .git/modules/submodules submodules 2>/dev/null || true + # Recreate directories with proper permissions for git submodule operations + - mkdir -p .git/modules submodules # Manually init submodules without nested recursion - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -491,8 +492,9 @@ sync: git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf/hive" # Clean up any stale submodule state - deinit first, then remove git submodule deinit -f --all 2>/dev/null || true - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - sudo rm -rf submodules 2>/dev/null || rm -rf submodules 2>/dev/null || true + sudo rm -rf .git/modules/submodules submodules 2>/dev/null || rm -rf .git/modules/submodules submodules 2>/dev/null || true + # Recreate directories with proper permissions for git submodule operations + mkdir -p .git/modules submodules # Manually init all submodules - HAF with its nested hive, plus btracker, hafah, reptracker git submodule update --init --force git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive -- GitLab From c2a3af0d4eae314898729a2c5cb2fc9361b813f1 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 03:01:53 -0500 Subject: [PATCH 046/108] Fix: Clean nested submodule state and use recursive for pattern-test - pre_get_sources: Remove nested submodule modules/ directories to prevent 'transport file not allowed' errors from stale file:// URL configs - pattern-test: Use GIT_SUBMODULE_STRATEGY: recursive since it needs the nested haf/hive submodule for hive-local-tools --- .gitlab-ci.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2d6f69b9..18d4b156 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -98,6 +98,17 @@ default: done fi + # Clean nested submodule configs with file:// URLs (causes 'transport file not allowed' errors) + # This happens when workspaces have stale submodule state from previous runs + if [ -d ".git/modules/submodules" ]; then + for nested_modules in .git/modules/submodules/*/modules; do + if [ -d "$nested_modules" ]; then + echo "pre_get_sources: removing nested submodule state: $nested_modules" + rm -rf "$nested_modules" + fi + done + fi + # Handle directory-to-submodule transitions: fetch target ref's .gitmodules # and remove any paths that exist as regular directories (not submodules) if [ -n "$CI_COMMIT_REF_NAME" ]; then @@ -844,6 +855,8 @@ pattern-test: - *haf-instance-with-nfs-fallback - *postgrest-service variables: + # Need recursive for nested haf/hive submodule (hive-local-tools) + GIT_SUBMODULE_STRATEGY: recursive JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml PYTEST_BASED_IMAGE_NAME: $BUILDER_IMAGE_PATH 
POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools -- GitLab From 9a170b9041c3f08f6705f9421723e17f73e3ca5f Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 03:21:03 -0500 Subject: [PATCH 047/108] Fix: Use recursive submodule strategy for all HAF-dependent test jobs GitLab's 'normal' strategy fails when entering submodules that have nested submodules (like haf/hive). Using 'recursive' properly handles this. --- .gitlab-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 18d4b156..e4daedb2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -729,6 +729,8 @@ regression-test: when: never - when: on_success variables: + # Use recursive to properly handle nested submodules (haf/hive) + GIT_SUBMODULE_STRATEGY: recursive HAF_APP_SCHEMA: "hafbe_app" services: - *haf-instance-with-nfs-fallback @@ -772,6 +774,8 @@ setup-scripts-test: when: never - when: on_success variables: + # Use recursive to properly handle nested submodules (haf/hive) + GIT_SUBMODULE_STRATEGY: recursive HAF_APP_SCHEMA: "hafbe_app" services: - *haf-instance-with-nfs-fallback @@ -807,6 +811,8 @@ performance-test: when: never - when: on_success variables: + # Use recursive to properly handle nested submodules (haf/hive) + GIT_SUBMODULE_STRATEGY: recursive HAF_APP_SCHEMA: "hafbe_app" services: - *haf-instance-with-nfs-fallback -- GitLab From 74ba3e792c5449fbae94139241b66a5b7d0f9c66 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 03:39:52 -0500 Subject: [PATCH 048/108] Fix: Create .git/modules/submodules directory before submodule init mkdir -p .git/modules submodules creates two separate directories: .git/modules and submodules. This prevents git from creating .git/modules/submodules/haf during submodule init. Changed to mkdir -p .git/modules/submodules submodules to create the proper directory tree. --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e4daedb2..4f5d0943 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -272,7 +272,7 @@ prepare_haf_image: - git submodule deinit -f --all 2>/dev/null || true - sudo rm -rf .git/modules/submodules submodules 2>/dev/null || rm -rf .git/modules/submodules submodules 2>/dev/null || true # Recreate directories with proper permissions for git submodule operations - - mkdir -p .git/modules submodules + - mkdir -p .git/modules/submodules submodules # Manually init submodules without nested recursion - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -505,7 +505,7 @@ sync: git submodule deinit -f --all 2>/dev/null || true sudo rm -rf .git/modules/submodules submodules 2>/dev/null || rm -rf .git/modules/submodules submodules 2>/dev/null || true # Recreate directories with proper permissions for git submodule operations - mkdir -p .git/modules submodules + mkdir -p .git/modules/submodules submodules # Manually init all submodules - HAF with its nested hive, plus btracker, hafah, reptracker git submodule update --init --force git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive -- GitLab From 4e671d95cbd378171667d2afc29cc1a7c1ed07cd Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 03:57:58 -0500 Subject: [PATCH 049/108] Fix nested submodule cleanup to use recursive find The glob pattern .git/modules/submodules/*/modules only cleaned one level deep. 
Changed to use find to recursively remove ALL modules directories at any depth, fixing errors like: fatal: not a git repository: haf/../../../.git/modules/submodules/btracker/modules/haf --- .gitlab-ci.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4f5d0943..ce7e4b27 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -100,13 +100,10 @@ default: # Clean nested submodule configs with file:// URLs (causes 'transport file not allowed' errors) # This happens when workspaces have stale submodule state from previous runs + # Use find to recursively remove ALL nested modules directories at any depth if [ -d ".git/modules/submodules" ]; then - for nested_modules in .git/modules/submodules/*/modules; do - if [ -d "$nested_modules" ]; then - echo "pre_get_sources: removing nested submodule state: $nested_modules" - rm -rf "$nested_modules" - fi - done + echo "pre_get_sources: removing all nested submodule state under .git/modules/submodules" + find .git/modules/submodules -type d -name "modules" -exec rm -rf {} + 2>/dev/null || true fi # Handle directory-to-submodule transitions: fetch target ref's .gitmodules -- GitLab From b011cf5ae150b240ef13fcba8b013c2af8740c88 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 04:03:03 -0500 Subject: [PATCH 050/108] Fix: Use sudo to create submodule directories after sudo cleanup After sudo rm -rf removes the submodule directories, mkdir fails with permission denied because the parent directories may now be root-owned or the directory removal left the filesystem in an inconsistent state. Fix by using sudo mkdir followed by sudo chown to restore proper ownership for the current user, allowing git submodule operations to proceed. --- .gitlab-ci.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ce7e4b27..5a759ee6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -267,9 +267,9 @@ prepare_haf_image: - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive # Clean up any stale submodule state - deinit first, then remove - git submodule deinit -f --all 2>/dev/null || true + # Remove with sudo, then recreate with sudo and fix ownership so git can use them - sudo rm -rf .git/modules/submodules submodules 2>/dev/null || rm -rf .git/modules/submodules submodules 2>/dev/null || true - # Recreate directories with proper permissions for git submodule operations - - mkdir -p .git/modules/submodules submodules + - sudo mkdir -p .git/modules/submodules submodules && sudo chown -R $(id -u):$(id -g) .git/modules/submodules submodules # Manually init submodules without nested recursion - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -301,8 +301,9 @@ prepare_haf_data: - git config --global --add safe.directory $CI_PROJECT_DIR/submodules/haf/hive # Clean up any stale submodule state - deinit first, then remove - git submodule deinit -f --all 2>/dev/null || true - - sudo rm -rf .git/modules/submodules 2>/dev/null || rm -rf .git/modules/submodules 2>/dev/null || true - - sudo rm -rf submodules 2>/dev/null || rm -rf submodules 2>/dev/null || true + # Remove with sudo, then recreate with sudo and fix ownership so git can use them + - sudo rm -rf .git/modules/submodules submodules 2>/dev/null || rm -rf .git/modules/submodules submodules 2>/dev/null || true + - sudo mkdir -p .git/modules/submodules submodules && sudo chown -R 
$(id -u):$(id -g) .git/modules/submodules submodules # Manually init just the haf submodule without nested submodules - git submodule update --init --force submodules/haf # HAF scripts are symlinks to hive submodule - init it without recursion @@ -500,9 +501,9 @@ sync: git config --global --add safe.directory "$CI_PROJECT_DIR/submodules/haf/hive" # Clean up any stale submodule state - deinit first, then remove git submodule deinit -f --all 2>/dev/null || true + # Remove with sudo, then recreate with sudo and fix ownership so git can use them sudo rm -rf .git/modules/submodules submodules 2>/dev/null || rm -rf .git/modules/submodules submodules 2>/dev/null || true - # Recreate directories with proper permissions for git submodule operations - mkdir -p .git/modules/submodules submodules + sudo mkdir -p .git/modules/submodules submodules && sudo chown -R $(id -u):$(id -g) .git/modules/submodules submodules # Manually init all submodules - HAF with its nested hive, plus btracker, hafah, reptracker git submodule update --init --force git -C "$CI_PROJECT_DIR/submodules/haf" submodule update --init hive -- GitLab From 7853d3a8e1886bb9b0601c1ca1e5e548d0ab5644 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 04:30:28 -0500 Subject: [PATCH 051/108] Fix: Download cache-manager.sh in after_script if submodule missing The sync job's before_script cleans submodules with 'git submodule deinit' and 'sudo rm -rf submodules', but the after_script tries to use cache-manager.sh from submodules/haf to push data to NFS. Since the submodule is cleaned, the file doesn't exist, causing the NFS push to be silently skipped. This means test jobs on other runners can't find the sync data. Fix by downloading cache-manager.sh from common-ci-configuration if the submodule copy is not available. --- .gitlab-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a759ee6..79745de4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -583,7 +583,13 @@ sync: ls -lah "${DATADIR}" ls -lah "${LOCAL_HAFBE_CACHE}" || true + # Download cache-manager if not available in submodule (may have been cleaned) CACHE_MANAGER="${CI_PROJECT_DIR}/submodules/haf/scripts/ci-helpers/cache-manager.sh" + if [[ ! -x "$CACHE_MANAGER" ]]; then + CACHE_MANAGER="/tmp/cache-manager.sh" + curl -fsSL "https://gitlab.syncad.com/hive/common-ci-configuration/-/raw/develop/scripts/cache-manager.sh" -o "$CACHE_MANAGER" + chmod +x "$CACHE_MANAGER" + fi if [[ -x "$CACHE_MANAGER" ]]; then echo "Pushing sync data to NFS cache: ${HAFBE_SYNC_CACHE_TYPE}/${HAFBE_CACHE_KEY}" CACHE_HANDLING=haf "$CACHE_MANAGER" put "${HAFBE_SYNC_CACHE_TYPE}" "${HAFBE_CACHE_KEY}" "${LOCAL_HAFBE_CACHE}" || echo "Warning: Failed to push to NFS cache" -- GitLab From 1afc638686503149a2e204fd0792e7a73b788bc7 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 04:40:24 -0500 Subject: [PATCH 052/108] Fix nested submodule cleanup: replace find with a per-submodule loop The previous find command with -exec rm -rf {} + could fail when processing nested paths because find might try to process children after their parent was already removed. Switch to a simpler loop that removes the 'modules' subdirectory from each top-level submodule (btracker/modules, hafah/modules, etc.), which handles the nested haf submodule cleanly.
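An aside on the parent-before-child problem described above: with find, the standard way to avoid visiting children of a directory you are about to delete is to -prune each match. A sketch along these lines (illustrative only, not part of this patch) would also have worked:

    # -prune stops descent into each matched 'modules' dir before -exec removes it,
    # so find never tries to stat children that vanished with their parent.
    find .git/modules/submodules -type d -name modules -prune -exec rm -rf {} + 2>/dev/null || true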
--- .gitlab-ci.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 79745de4..fefb8175 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -100,10 +100,13 @@ default: # Clean nested submodule configs with file:// URLs (causes 'transport file not allowed' errors) # This happens when workspaces have stale submodule state from previous runs - # Use find to recursively remove ALL nested modules directories at any depth + # Remove nested modules directories - loop to handle any depth if [ -d ".git/modules/submodules" ]; then echo "pre_get_sources: removing all nested submodule state under .git/modules/submodules" - find .git/modules/submodules -type d -name "modules" -exec rm -rf {} + 2>/dev/null || true + # Remove any 'modules' subdirectory within each top-level submodule (e.g., btracker/modules, hafah/modules) + for submod_dir in .git/modules/submodules/*/; do + [ -d "${submod_dir}modules" ] && rm -rf "${submod_dir}modules" + done fi # Handle directory-to-submodule transitions: fetch target ref's .gitmodules -- GitLab From ca74c01854641f598f9f9c7cd3929e7a6c29a0de Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 04:53:55 -0500 Subject: [PATCH 053/108] Fix: Use BUILDER_IMAGE for performance-test (needs python3) The performance-test job calls generate_db.py which requires python3, but the ci-runner:docker-26.1.4-1 image doesn't have Python installed. Switch to BUILDER_IMAGE_PATH which includes Python 3.14. --- .gitlab-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fefb8175..2f9e0801 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -800,7 +800,8 @@ setup-scripts-test: performance-test: extends: .wait-for-haf-postgres - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 + # Use BUILDER_IMAGE with Python (ci-runner doesn't have python3 needed for generate_db.py) + image: $BUILDER_IMAGE_PATH stage: test needs: - job: quick_test_setup -- GitLab From 8da0e9d5f0719444cdba953742fcab8db06c4b6c Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 05:05:27 -0500 Subject: [PATCH 054/108] Fix nested submodule cleanup to use find with -exec {} \; The for loop with bash glob wasn't reliably cleaning nested modules directories. Switch to find with -mindepth 2 -maxdepth 2 to target exactly the nested modules directories (e.g., btracker/modules, hafah/modules) and use -exec {} \; to remove each one individually, avoiding the batching issues with -exec {} +. 
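For reference, the depth bounds in this patch select exactly the second level under the modules root; using the submodule names from this series as a hypothetical layout:

    # Depth relative to .git/modules/submodules:
    #   btracker/modules   -> depth 2: matched and removed
    #   haf/modules/hive   -> depth 3: not matched itself, but removed along with haf/modules
    #   modules            -> depth 1: not matched
    find .git/modules/submodules -mindepth 2 -maxdepth 2 -type d -name "modules" -exec rm -rf {} \; 2>/dev/null || true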
--- .gitlab-ci.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2f9e0801..71f62bc3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -100,13 +100,11 @@ default: # Clean nested submodule configs with file:// URLs (causes 'transport file not allowed' errors) # This happens when workspaces have stale submodule state from previous runs - # Remove nested modules directories - loop to handle any depth + # Remove nested modules directories at any depth using find (more reliable than glob) if [ -d ".git/modules/submodules" ]; then echo "pre_get_sources: removing all nested submodule state under .git/modules/submodules" - # Remove any 'modules' subdirectory within each top-level submodule (e.g., btracker/modules, hafah/modules) - for submod_dir in .git/modules/submodules/*/; do - [ -d "${submod_dir}modules" ] && rm -rf "${submod_dir}modules" - done + # Use find to locate and remove all 'modules' directories (nested submodule state) + find .git/modules/submodules -mindepth 2 -maxdepth 2 -type d -name "modules" -exec rm -rf {} \; 2>/dev/null || true fi # Handle directory-to-submodule transitions: fetch target ref's .gitmodules -- GitLab From d6ac91e9e68f4aa79a91eb22c5876f43a270e9f5 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 05:16:59 -0500 Subject: [PATCH 055/108] Fix nested submodule cleanup with direct rm -rf commands Replace find command with direct rm -rf on known nested module paths. The find command with -exec wasn't reliably removing the directories. Explicitly remove modules/ subdirectory from each top-level submodule: - btracker/modules - hafah/modules - reptracker/modules - haf/modules --- .gitlab-ci.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 71f62bc3..ba5e44c8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -100,11 +100,14 @@ default: # Clean nested submodule configs with file:// URLs (causes 'transport file not allowed' errors) # This happens when workspaces have stale submodule state from previous runs - # Remove nested modules directories at any depth using find (more reliable than glob) + # Remove nested modules directories directly (btracker/modules, hafah/modules, etc.) if [ -d ".git/modules/submodules" ]; then echo "pre_get_sources: removing all nested submodule state under .git/modules/submodules" - # Use find to locate and remove all 'modules' directories (nested submodule state) - find .git/modules/submodules -mindepth 2 -maxdepth 2 -type d -name "modules" -exec rm -rf {} \; 2>/dev/null || true + # Direct removal of known nested module paths - more reliable than find + rm -rf .git/modules/submodules/btracker/modules 2>/dev/null || true + rm -rf .git/modules/submodules/hafah/modules 2>/dev/null || true + rm -rf .git/modules/submodules/reptracker/modules 2>/dev/null || true + rm -rf .git/modules/submodules/haf/modules 2>/dev/null || true fi # Handle directory-to-submodule transitions: fetch target ref's .gitmodules -- GitLab From 1e63a5cf190334e2f7f689a489ac206f667ea38e Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 05:29:44 -0500 Subject: [PATCH 056/108] Fix nested submodule cleanup: also remove stale working tree dirs The previous fix only removed .git/modules/submodules/*/modules/ dirs, but the working tree submodule directories (e.g., submodules/btracker/haf) still had .git files pointing to the now-deleted modules paths. 
When git tried to recurse into these nested submodules, it found the stale .git file pointing to a non-existent directory. Fix by also removing the nested submodule working directories: - submodules/btracker/haf - submodules/hafah/haf - submodules/reptracker/haf - submodules/haf/hive --- .gitlab-ci.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ba5e44c8..104b1d69 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -100,14 +100,20 @@ default: # Clean nested submodule configs with file:// URLs (causes 'transport file not allowed' errors) # This happens when workspaces have stale submodule state from previous runs - # Remove nested modules directories directly (btracker/modules, hafah/modules, etc.) + # Remove both the .git/modules nested dirs AND the working tree nested submodule .git files if [ -d ".git/modules/submodules" ]; then echo "pre_get_sources: removing all nested submodule state under .git/modules/submodules" - # Direct removal of known nested module paths - more reliable than find + # Remove nested modules directories in .git/modules rm -rf .git/modules/submodules/btracker/modules 2>/dev/null || true rm -rf .git/modules/submodules/hafah/modules 2>/dev/null || true rm -rf .git/modules/submodules/reptracker/modules 2>/dev/null || true rm -rf .git/modules/submodules/haf/modules 2>/dev/null || true + # Also remove nested submodule working directories that have stale .git files + # These point to the now-deleted modules directories + rm -rf submodules/btracker/haf 2>/dev/null || true + rm -rf submodules/hafah/haf 2>/dev/null || true + rm -rf submodules/reptracker/haf 2>/dev/null || true + rm -rf submodules/haf/hive 2>/dev/null || true fi # Handle directory-to-submodule transitions: fetch target ref's .gitmodules -- GitLab From 5b8056aa9284ca5a22fafcba42cc2379195e12a3 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 05:53:47 -0500 Subject: [PATCH 057/108] Fix NFS cache fallback: extract cache in main job for service container Service containers may not have NFS mounted despite runner config. Solution: 1. Service container waits for cache to appear if NFS not accessible 2. Main job's before_script extracts NFS cache to local /cache/ 3. New .wait-for-haf-postgres-with-nfs extension handles extraction 4. Test jobs use new extension to ensure cache is ready for service Also adds better debug output to diagnose NFS access issues. --- .gitlab-ci.yml | 73 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 104b1d69..f3377059 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -649,17 +649,23 @@ sync: NFS_TYPE="${DATA_SOURCE_NFS_TYPE:-haf_sync}" NFS_KEY="${DATA_SOURCE_NFS_KEY}" - # If original path doesn't exist, try NFS fallback + # If original path doesn't exist, try NFS fallback or wait for main job extraction if [[ ! 
-d "${ORIGINAL_SOURCE}/datadir" ]]; then echo "Local cache not found: ${ORIGINAL_SOURCE}/datadir" NFS_PATH="${NFS_PREFIX}/${NFS_TYPE}/${NFS_KEY}" NFS_TAR="${NFS_PATH}.tar" + # Debug: show what we're looking for + echo "NFS_PREFIX=${NFS_PREFIX} NFS_TYPE=${NFS_TYPE} NFS_KEY=${NFS_KEY}" + echo "Checking NFS path: ${NFS_PATH}" + echo "Checking NFS tar: ${NFS_TAR}" + ls -la "${NFS_PREFIX}/${NFS_TYPE}/" 2>&1 | head -5 || echo "Cannot list NFS directory (NFS may not be mounted in container)" + if [[ -d "${NFS_PATH}/datadir" ]]; then echo "Found data on NFS directory: $NFS_PATH" export DATA_SOURCE="$NFS_PATH" elif [[ -f "${NFS_TAR}" ]]; then - echo "Found NFS tar archive: $NFS_TAR" + echo "Found NFS tar archive: $NFS_TAR ($(stat -c%s "$NFS_TAR" 2>/dev/null || echo 'size unknown') bytes)" echo "Extracting to: ${ORIGINAL_SOURCE}" mkdir -p "${ORIGINAL_SOURCE}" tar xf "${NFS_TAR}" -C "${ORIGINAL_SOURCE}" @@ -674,10 +680,25 @@ sync: echo "Extracted NFS cache successfully" export DATA_SOURCE="${ORIGINAL_SOURCE}" else - echo "WARNING: Data not found in local or NFS cache" - echo "Checked: ${ORIGINAL_SOURCE}/datadir" - echo "Checked: ${NFS_PATH}/datadir" - echo "Checked: ${NFS_TAR}" + # NFS not available in this container - wait for main job to extract cache + # The main job's before_script will extract NFS cache to local /cache/ path + echo "NFS tar not found - waiting for main job to extract cache..." + echo "Expected location: ${ORIGINAL_SOURCE}/datadir" + WAIT_TIMEOUT=300 + ELAPSED=0 + while [[ ! -d "${ORIGINAL_SOURCE}/datadir" ]]; do + if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then + echo "ERROR: Timeout waiting for cache extraction" + echo "Main job should have extracted NFS cache to: ${ORIGINAL_SOURCE}" + ls -la /cache/ 2>&1 | head -10 || echo "Cannot list /cache/" + exit 1 + fi + sleep 5 + ELAPSED=$((ELAPSED + 5)) + echo "Waiting for cache... (${ELAPSED}s/${WAIT_TIMEOUT}s)" + done + echo "Cache appeared after ${ELAPSED}s" + export DATA_SOURCE="${ORIGINAL_SOURCE}" fi else echo "Using local cache: ${ORIGINAL_SOURCE}" @@ -702,6 +723,38 @@ sync: PGRST_DB_EXTRA_SEARCH_PATH: hafbe_bal, reptracker_app HEALTHCHECK_TCP_PORT: 3000 +# Extension that extracts NFS cache before waiting for PostgreSQL +# Service container waits for this extraction to complete +.wait-for-haf-postgres-with-nfs: + extends: .wait-for-haf-postgres + before_script: + - | + # Extract NFS cache to local /cache/ so service container can find it + LOCAL_CACHE="${DATA_CACHE_HAF_PREFIX}_${HAFBE_CACHE_KEY}" + NFS_TAR="${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/${HAFBE_CACHE_KEY}.tar" + echo "Checking cache: LOCAL_CACHE=${LOCAL_CACHE}, NFS_TAR=${NFS_TAR}" + if [[ ! -d "${LOCAL_CACHE}/datadir" ]]; then + if [[ -f "$NFS_TAR" ]]; then + echo "Extracting NFS cache for service container..." + mkdir -p "${LOCAL_CACHE}" + tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" + # Restore pgdata ownership and permissions for PostgreSQL + if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then + echo "Fixing PostgreSQL data permissions..." 
+ sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" + sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" + fi + echo "NFS cache extracted to: ${LOCAL_CACHE}" + else + echo "WARNING: NFS cache not found: $NFS_TAR" + ls -la "${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/" 2>&1 | head -5 || echo "Cannot list NFS directory" + fi + else + echo "Local cache already exists: ${LOCAL_CACHE}" + fi + - !reference [.wait-for-haf-postgres, before_script] + python_api_client_test: extends: .project_develop_configuration_template # Use Python 3.12 for consistency with generate_python_api_client @@ -724,7 +777,7 @@ python_api_client_test: - public-runner-docker regression-test: - extends: .wait-for-haf-postgres + extends: .wait-for-haf-postgres-with-nfs image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 stage: test needs: @@ -769,7 +822,7 @@ regression-test: - fast setup-scripts-test: - extends: .wait-for-haf-postgres + extends: .wait-for-haf-postgres-with-nfs image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 stage: test needs: @@ -806,7 +859,7 @@ setup-scripts-test: - fast performance-test: - extends: .wait-for-haf-postgres + extends: .wait-for-haf-postgres-with-nfs # Use BUILDER_IMAGE with Python (ci-runner doesn't have python3 needed for generate_db.py) image: $BUILDER_IMAGE_PATH stage: test @@ -887,7 +940,7 @@ pattern-test: HAF_APP_SCHEMA: "hafbe_app" before_script: - !reference [.pytest_based_template, before_script] - - !reference [.wait-for-haf-postgres, before_script] + - !reference [.wait-for-haf-postgres-with-nfs, before_script] script: - | cd $CI_PROJECT_DIR/tests/tavern -- GitLab From 8caa91c39345f570b476d922014f2de41169e8e5 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 06:13:24 -0500 Subject: [PATCH 058/108] Fix regression-test: use BUILDER_IMAGE for python3 support The accounts_dump_test.sh script requires python3 for data_insertion_script.py. The ci-runner image doesn't have python3, but BUILDER_IMAGE does. --- .gitlab-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f3377059..5b1de934 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -778,7 +778,8 @@ python_api_client_test: regression-test: extends: .wait-for-haf-postgres-with-nfs - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 + # Use BUILDER_IMAGE with Python (ci-runner doesn't have python3 needed for data_insertion_script.py) + image: $BUILDER_IMAGE_PATH stage: test needs: - job: quick_test_setup -- GitLab From f53d6f9c4ebd78415dd16f81afcb1e0dfd30d1a6 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 06:34:43 -0500 Subject: [PATCH 059/108] Fix NFS cache race: wait for complete extraction before PostgreSQL check When service container and main job race to extract NFS cache, the main job saw 'datadir exists' but extraction was incomplete. Now we check for pgdata/PG_VERSION as the completion marker and wait if extraction is in progress. 
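The marker-based waiting introduced here (and reused in later patches) reduces to a small polling loop; a condensed sketch, with the helper name and polling interval chosen for illustration rather than taken from the patch:

    # Poll until a marker file written near the end of extraction appears.
    wait_for_marker() {
      local marker="$1" timeout="${2:-300}" elapsed=0
      while [[ ! -f "$marker" ]]; do
        if (( elapsed >= timeout )); then
          echo "ERROR: timed out after ${timeout}s waiting for ${marker}" >&2
          return 1
        fi
        sleep 5
        elapsed=$((elapsed + 5))
      done
      echo "Marker appeared after ${elapsed}s"
    }
    # Example: wait_for_marker "${PGDATA_DIR}/PG_VERSION" 600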
--- .gitlab-ci.yml | 46 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5b1de934..b8a5b3c5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -730,28 +730,50 @@ sync: before_script: - | # Extract NFS cache to local /cache/ so service container can find it + # Check for pgdata (not just datadir) to ensure extraction is complete LOCAL_CACHE="${DATA_CACHE_HAF_PREFIX}_${HAFBE_CACHE_KEY}" NFS_TAR="${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/${HAFBE_CACHE_KEY}.tar" + PGDATA_DIR="${LOCAL_CACHE}/datadir/haf_db_store/pgdata" echo "Checking cache: LOCAL_CACHE=${LOCAL_CACHE}, NFS_TAR=${NFS_TAR}" - if [[ ! -d "${LOCAL_CACHE}/datadir" ]]; then + # Wait for complete extraction (check for pgdata/PG_VERSION as completion marker) + if [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; then + echo "Cache not ready (pgdata/PG_VERSION missing), checking NFS..." if [[ -f "$NFS_TAR" ]]; then - echo "Extracting NFS cache for service container..." - mkdir -p "${LOCAL_CACHE}" - tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" - # Restore pgdata ownership and permissions for PostgreSQL - if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then - echo "Fixing PostgreSQL data permissions..." - sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" - sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" + # Check if service is already extracting (datadir exists but pgdata incomplete) + if [[ -d "${LOCAL_CACHE}/datadir" ]] && [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; then + echo "Extraction in progress by service container, waiting for completion..." + WAIT_TIMEOUT=300 + ELAPSED=0 + while [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; do + if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then + echo "ERROR: Timeout waiting for cache extraction to complete" + ls -la "${LOCAL_CACHE}/datadir/" 2>&1 || true + exit 1 + fi + sleep 5 + ELAPSED=$((ELAPSED + 5)) + echo "Waiting for extraction... (${ELAPSED}s/${WAIT_TIMEOUT}s)" + done + echo "Extraction completed after ${ELAPSED}s" + else + echo "Extracting NFS cache for service container..." + mkdir -p "${LOCAL_CACHE}" + tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" + # Restore pgdata ownership and permissions for PostgreSQL + if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then + echo "Fixing PostgreSQL data permissions..." + sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" + sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" + fi + echo "NFS cache extracted to: ${LOCAL_CACHE}" fi - echo "NFS cache extracted to: ${LOCAL_CACHE}" else echo "WARNING: NFS cache not found: $NFS_TAR" ls -la "${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/" 2>&1 | head -5 || echo "Cannot list NFS directory" fi else - echo "Local cache already exists: ${LOCAL_CACHE}" + echo "Local cache ready: ${LOCAL_CACHE}" fi - !reference [.wait-for-haf-postgres, before_script] -- GitLab From 19ed852a33595e6047ff58cc4c44f88cc15cc70e Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 06:51:23 -0500 Subject: [PATCH 060/108] Fix NFS cache extraction: main job extracts instead of waiting for service The service container's NFS extraction was timing out (17GB tar). 
Changed approach so main job always does the extraction: - Main job extracts NFS cache directly (has faster NFS access) - If partial extraction found, removes and re-extracts - Service container waits for PG_VERSION completion marker - Increased service wait timeout to 600s for extraction time --- .gitlab-ci.yml | 74 +++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b8a5b3c5..6b3aa90c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -682,22 +682,24 @@ sync: else # NFS not available in this container - wait for main job to extract cache # The main job's before_script will extract NFS cache to local /cache/ path + # Wait for PG_VERSION as completion marker (main job may remove partial extractions) + PGDATA_DIR="${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" echo "NFS tar not found - waiting for main job to extract cache..." - echo "Expected location: ${ORIGINAL_SOURCE}/datadir" - WAIT_TIMEOUT=300 + echo "Waiting for completion marker: ${PGDATA_DIR}/PG_VERSION" + WAIT_TIMEOUT=600 ELAPSED=0 - while [[ ! -d "${ORIGINAL_SOURCE}/datadir" ]]; do + while [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; do if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then echo "ERROR: Timeout waiting for cache extraction" echo "Main job should have extracted NFS cache to: ${ORIGINAL_SOURCE}" - ls -la /cache/ 2>&1 | head -10 || echo "Cannot list /cache/" + ls -la "${ORIGINAL_SOURCE}/" 2>&1 | head -10 || echo "Cannot list cache dir" exit 1 fi sleep 5 ELAPSED=$((ELAPSED + 5)) - echo "Waiting for cache... (${ELAPSED}s/${WAIT_TIMEOUT}s)" + echo "Waiting for cache extraction... (${ELAPSED}s/${WAIT_TIMEOUT}s)" done - echo "Cache appeared after ${ELAPSED}s" + echo "Cache ready after ${ELAPSED}s" export DATA_SOURCE="${ORIGINAL_SOURCE}" fi else @@ -735,45 +737,31 @@ sync: NFS_TAR="${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/${HAFBE_CACHE_KEY}.tar" PGDATA_DIR="${LOCAL_CACHE}/datadir/haf_db_store/pgdata" echo "Checking cache: LOCAL_CACHE=${LOCAL_CACHE}, NFS_TAR=${NFS_TAR}" - # Wait for complete extraction (check for pgdata/PG_VERSION as completion marker) - if [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; then - echo "Cache not ready (pgdata/PG_VERSION missing), checking NFS..." - if [[ -f "$NFS_TAR" ]]; then - # Check if service is already extracting (datadir exists but pgdata incomplete) - if [[ -d "${LOCAL_CACHE}/datadir" ]] && [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; then - echo "Extraction in progress by service container, waiting for completion..." - WAIT_TIMEOUT=300 - ELAPSED=0 - while [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; do - if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then - echo "ERROR: Timeout waiting for cache extraction to complete" - ls -la "${LOCAL_CACHE}/datadir/" 2>&1 || true - exit 1 - fi - sleep 5 - ELAPSED=$((ELAPSED + 5)) - echo "Waiting for extraction... (${ELAPSED}s/${WAIT_TIMEOUT}s)" - done - echo "Extraction completed after ${ELAPSED}s" - else - echo "Extracting NFS cache for service container..." - mkdir -p "${LOCAL_CACHE}" - tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" - # Restore pgdata ownership and permissions for PostgreSQL - if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then - echo "Fixing PostgreSQL data permissions..." 
- sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" - sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" - fi - echo "NFS cache extracted to: ${LOCAL_CACHE}" - fi - else - echo "WARNING: NFS cache not found: $NFS_TAR" - ls -la "${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/" 2>&1 | head -5 || echo "Cannot list NFS directory" + # Check for complete extraction (pgdata/PG_VERSION as completion marker) + if [[ -f "${PGDATA_DIR}/PG_VERSION" ]]; then + echo "Local cache ready: ${LOCAL_CACHE}" + elif [[ -f "$NFS_TAR" ]]; then + echo "Cache not ready (pgdata/PG_VERSION missing), will extract from NFS..." + # If partial extraction exists, remove it and start fresh + # (service container may have started but main job is faster with direct NFS access) + if [[ -d "${LOCAL_CACHE}/datadir" ]]; then + echo "Partial extraction found, removing and re-extracting..." + sudo rm -rf "${LOCAL_CACHE}/datadir" fi + echo "Extracting NFS cache for service container..." + mkdir -p "${LOCAL_CACHE}" + tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" + # Restore pgdata ownership and permissions for PostgreSQL + if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then + echo "Fixing PostgreSQL data permissions..." + sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" + sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" + fi + echo "NFS cache extracted to: ${LOCAL_CACHE}" else - echo "Local cache ready: ${LOCAL_CACHE}" + echo "WARNING: NFS cache not found: $NFS_TAR" + ls -la "${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/" 2>&1 | head -5 || echo "Cannot list NFS directory" fi - !reference [.wait-for-haf-postgres, before_script] -- GitLab From f64e1641b56ae33ceb35e882476f67bcb875e580 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 07:02:38 -0500 Subject: [PATCH 061/108] Fix NFS cache extraction permission: use sudo for tar The tar extraction failed with 'Permission denied' when trying to create datadir after removing partial extraction. Use sudo for both mkdir and tar to ensure proper permissions during cache extraction. --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6b3aa90c..bc5ff9fe 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -749,8 +749,8 @@ sync: sudo rm -rf "${LOCAL_CACHE}/datadir" fi echo "Extracting NFS cache for service container..." - mkdir -p "${LOCAL_CACHE}" - tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" + sudo mkdir -p "${LOCAL_CACHE}" + sudo tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" # Restore pgdata ownership and permissions for PostgreSQL if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then echo "Fixing PostgreSQL data permissions..." -- GitLab From a94064efd6c88734a9c08b2a97855f727ff686ea Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 07:20:09 -0500 Subject: [PATCH 062/108] Fix NFS cache race: wait for service if rm fails When trying to remove partial extraction, if rm -rf fails (because service container has files open), wait for service to complete instead of erroring out. Uses 600s timeout for large cache extraction. 
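The try-remove-or-wait heuristic above approximates mutual exclusion; the conventional shell pattern for letting exactly one process perform such an extraction is an atomic mkdir lock, sketched below (lock path hypothetical; a later patch in this series adopts a variant of this):

    LOCK_DIR="${LOCAL_CACHE}.extracting"     # mkdir is atomic: exactly one process succeeds
    if mkdir "$LOCK_DIR" 2>/dev/null; then
      trap 'rmdir "$LOCK_DIR"' EXIT          # release the lock even on failure
      mkdir -p "${LOCAL_CACHE}"
      tar xf "$NFS_TAR" -C "${LOCAL_CACHE}"
    else
      # Lost the race: wait for the winner to finish (timeout omitted for brevity).
      while [[ -d "$LOCK_DIR" ]]; do sleep 5; done
    fi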
--- .gitlab-ci.yml | 59 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bc5ff9fe..81789af0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -742,23 +742,52 @@ sync: echo "Local cache ready: ${LOCAL_CACHE}" elif [[ -f "$NFS_TAR" ]]; then echo "Cache not ready (pgdata/PG_VERSION missing), will extract from NFS..." - # If partial extraction exists, remove it and start fresh - # (service container may have started but main job is faster with direct NFS access) + # If partial extraction exists, try to remove it (service container may be extracting) if [[ -d "${LOCAL_CACHE}/datadir" ]]; then - echo "Partial extraction found, removing and re-extracting..." - sudo rm -rf "${LOCAL_CACHE}/datadir" - fi - echo "Extracting NFS cache for service container..." - sudo mkdir -p "${LOCAL_CACHE}" - sudo tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" - # Restore pgdata ownership and permissions for PostgreSQL - if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then - echo "Fixing PostgreSQL data permissions..." - sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" - sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" + echo "Partial extraction found, attempting to remove..." + # Try to remove - if it fails (service container has files open), wait for service + if ! sudo rm -rf "${LOCAL_CACHE}/datadir" 2>/dev/null; then + echo "Could not remove (service container extracting), waiting for completion..." + WAIT_TIMEOUT=600 + ELAPSED=0 + while [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; do + if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then + echo "ERROR: Timeout waiting for service container extraction" + ls -la "${LOCAL_CACHE}/" 2>&1 | head -10 || true + exit 1 + fi + sleep 10 + ELAPSED=$((ELAPSED + 10)) + echo "Waiting for service extraction... (${ELAPSED}s/${WAIT_TIMEOUT}s)" + done + echo "Service extraction completed after ${ELAPSED}s" + else + echo "Removed partial extraction, will extract fresh..." + echo "Extracting NFS cache for service container..." + sudo mkdir -p "${LOCAL_CACHE}" + sudo tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" + # Restore pgdata ownership and permissions for PostgreSQL + if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then + echo "Fixing PostgreSQL data permissions..." + sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" + sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" + fi + echo "NFS cache extracted to: ${LOCAL_CACHE}" + fi + else + echo "No existing extraction, extracting fresh from NFS..." + sudo mkdir -p "${LOCAL_CACHE}" + sudo tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" + # Restore pgdata ownership and permissions for PostgreSQL + if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then + echo "Fixing PostgreSQL data permissions..." 
+ sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" + sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" + fi + echo "NFS cache extracted to: ${LOCAL_CACHE}" fi - echo "NFS cache extracted to: ${LOCAL_CACHE}" else echo "WARNING: NFS cache not found: $NFS_TAR" ls -la "${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/" 2>&1 | head -5 || echo "Cannot list NFS directory" -- GitLab From 14483a48f1d39a4d679b4b0ed933acacc42b6c8f Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 07:39:38 -0500 Subject: [PATCH 063/108] Fix service container: ensure PostgreSQL permissions when using local cache When service container finds local cache already exists, it was not fixing PostgreSQL data permissions before starting. Added permission fix to ensure pgdata is owned by postgres (UID 105:109) with mode 700. --- .gitlab-ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 81789af0..dfce6133 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -704,6 +704,13 @@ sync: fi else echo "Using local cache: ${ORIGINAL_SOURCE}" + # Ensure PostgreSQL data permissions are correct + if [[ -d "${ORIGINAL_SOURCE}/datadir/haf_db_store" ]]; then + echo "Fixing PostgreSQL data permissions..." + sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_db_store" + sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" + fi fi # Run original entrypoint -- GitLab From 04b376ec548bf33741c2431d67bc9474b8c163ac Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 07:56:41 -0500 Subject: [PATCH 064/108] Fix service container: check PG_VERSION for complete cache before starting Service container was using local cache if datadir existed, but this could be a partial/incomplete extraction. Now checks for PG_VERSION file as the completion marker before using the cache. If cache is incomplete, waits for main job extraction to finish. --- .gitlab-ci.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index dfce6133..10ec9944 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -649,9 +649,12 @@ sync: NFS_TYPE="${DATA_SOURCE_NFS_TYPE:-haf_sync}" NFS_KEY="${DATA_SOURCE_NFS_KEY}" - # If original path doesn't exist, try NFS fallback or wait for main job extraction - if [[ ! -d "${ORIGINAL_SOURCE}/datadir" ]]; then - echo "Local cache not found: ${ORIGINAL_SOURCE}/datadir" + # Check for complete extraction using PG_VERSION as marker + PGDATA_DIR="${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" + + # If cache is not complete, try NFS fallback or wait for main job extraction + if [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; then + echo "Cache not complete (PG_VERSION missing): ${PGDATA_DIR}/PG_VERSION" NFS_PATH="${NFS_PREFIX}/${NFS_TYPE}/${NFS_KEY}" NFS_TAR="${NFS_PATH}.tar" @@ -683,7 +686,6 @@ sync: # NFS not available in this container - wait for main job to extract cache # The main job's before_script will extract NFS cache to local /cache/ path # Wait for PG_VERSION as completion marker (main job may remove partial extractions) - PGDATA_DIR="${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" echo "NFS tar not found - waiting for main job to extract cache..." 
echo "Waiting for completion marker: ${PGDATA_DIR}/PG_VERSION" WAIT_TIMEOUT=600 @@ -703,7 +705,7 @@ sync: export DATA_SOURCE="${ORIGINAL_SOURCE}" fi else - echo "Using local cache: ${ORIGINAL_SOURCE}" + echo "Cache complete, using local cache: ${ORIGINAL_SOURCE}" # Ensure PostgreSQL data permissions are correct if [[ -d "${ORIGINAL_SOURCE}/datadir/haf_db_store" ]]; then echo "Fixing PostgreSQL data permissions..." -- GitLab From a2b5b75e74434dcf726be34a504477c065bfbf25 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 08:16:52 -0500 Subject: [PATCH 065/108] Fix service container: always wait for main job extraction, never extract from NFS Service container was also extracting from NFS, racing with main job. This caused the partial extraction to be destroyed when main job removed and re-extracted. Now service container ALWAYS waits for main job to complete extraction, avoiding the race condition entirely. --- .gitlab-ci.yml | 71 ++++++++++++++++---------------------------------- 1 file changed, 23 insertions(+), 48 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 10ec9944..9b43dc31 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -652,57 +652,32 @@ sync: # Check for complete extraction using PG_VERSION as marker PGDATA_DIR="${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" - # If cache is not complete, try NFS fallback or wait for main job extraction + # If cache is not complete, wait for main job to extract it + # NOTE: Do NOT extract from NFS here - main job handles extraction to avoid race conditions if [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; then echo "Cache not complete (PG_VERSION missing): ${PGDATA_DIR}/PG_VERSION" - NFS_PATH="${NFS_PREFIX}/${NFS_TYPE}/${NFS_KEY}" - NFS_TAR="${NFS_PATH}.tar" - - # Debug: show what we're looking for - echo "NFS_PREFIX=${NFS_PREFIX} NFS_TYPE=${NFS_TYPE} NFS_KEY=${NFS_KEY}" - echo "Checking NFS path: ${NFS_PATH}" - echo "Checking NFS tar: ${NFS_TAR}" - ls -la "${NFS_PREFIX}/${NFS_TYPE}/" 2>&1 | head -5 || echo "Cannot list NFS directory (NFS may not be mounted in container)" - - if [[ -d "${NFS_PATH}/datadir" ]]; then - echo "Found data on NFS directory: $NFS_PATH" - export DATA_SOURCE="$NFS_PATH" - elif [[ -f "${NFS_TAR}" ]]; then - echo "Found NFS tar archive: $NFS_TAR ($(stat -c%s "$NFS_TAR" 2>/dev/null || echo 'size unknown') bytes)" - echo "Extracting to: ${ORIGINAL_SOURCE}" - mkdir -p "${ORIGINAL_SOURCE}" - tar xf "${NFS_TAR}" -C "${ORIGINAL_SOURCE}" - # Restore pgdata ownership and permissions for PostgreSQL - # PostgreSQL requires pgdata to be owned by postgres (UID 105) with mode 700 - if [[ -d "${ORIGINAL_SOURCE}/datadir/haf_db_store" ]]; then - echo "Fixing PostgreSQL data permissions..." - sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_db_store" - sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" + echo "Waiting for main job to extract cache..." + echo "Waiting for completion marker: ${PGDATA_DIR}/PG_VERSION" + WAIT_TIMEOUT=600 + ELAPSED=0 + while [[ ! 
-f "${PGDATA_DIR}/PG_VERSION" ]]; do + if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then + echo "ERROR: Timeout waiting for cache extraction" + echo "Main job should have extracted NFS cache to: ${ORIGINAL_SOURCE}" + ls -la "${ORIGINAL_SOURCE}/" 2>&1 | head -10 || echo "Cannot list cache dir" + exit 1 fi - echo "Extracted NFS cache successfully" - export DATA_SOURCE="${ORIGINAL_SOURCE}" - else - # NFS not available in this container - wait for main job to extract cache - # The main job's before_script will extract NFS cache to local /cache/ path - # Wait for PG_VERSION as completion marker (main job may remove partial extractions) - echo "NFS tar not found - waiting for main job to extract cache..." - echo "Waiting for completion marker: ${PGDATA_DIR}/PG_VERSION" - WAIT_TIMEOUT=600 - ELAPSED=0 - while [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; do - if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then - echo "ERROR: Timeout waiting for cache extraction" - echo "Main job should have extracted NFS cache to: ${ORIGINAL_SOURCE}" - ls -la "${ORIGINAL_SOURCE}/" 2>&1 | head -10 || echo "Cannot list cache dir" - exit 1 - fi - sleep 5 - ELAPSED=$((ELAPSED + 5)) - echo "Waiting for cache extraction... (${ELAPSED}s/${WAIT_TIMEOUT}s)" - done - echo "Cache ready after ${ELAPSED}s" - export DATA_SOURCE="${ORIGINAL_SOURCE}" + sleep 5 + ELAPSED=$((ELAPSED + 5)) + echo "Waiting for cache extraction... (${ELAPSED}s/${WAIT_TIMEOUT}s)" + done + echo "Cache ready after ${ELAPSED}s" + # Ensure PostgreSQL data permissions are correct after main job extraction + if [[ -d "${ORIGINAL_SOURCE}/datadir/haf_db_store" ]]; then + echo "Fixing PostgreSQL data permissions..." + sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_db_store" + sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" fi else echo "Cache complete, using local cache: ${ORIGINAL_SOURCE}" -- GitLab From f694bc700a31e50d56dc842b871ba1fd3ea32fc8 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 08:47:54 -0500 Subject: [PATCH 066/108] Simplify service container entrypoint for better reliability - Removed -x flag to reduce shell overhead - Simplified variable handling - Made permission fix errors non-fatal - Always wait for main job extraction to complete --- .gitlab-ci.yml | 67 ++++++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 46 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9b43dc31..8f3d61b5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -642,55 +642,30 @@ sync: - '/bin/bash' - '-c' - | - set -xeuo pipefail - echo "Checking data source availability..." - ORIGINAL_SOURCE="${DATA_SOURCE}" - NFS_PREFIX="${DATA_SOURCE_NFS_PREFIX:-/nfs/ci-cache}" - NFS_TYPE="${DATA_SOURCE_NFS_TYPE:-haf_sync}" - NFS_KEY="${DATA_SOURCE_NFS_KEY}" + set -euo pipefail + # Wait for main job to extract cache, then start PostgreSQL + PGDATA_DIR="${DATA_SOURCE}/datadir/haf_db_store/pgdata" + echo "Service container waiting for cache: ${PGDATA_DIR}/PG_VERSION" - # Check for complete extraction using PG_VERSION as marker - PGDATA_DIR="${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" - - # If cache is not complete, wait for main job to extract it - # NOTE: Do NOT extract from NFS here - main job handles extraction to avoid race conditions - if [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; then - echo "Cache not complete (PG_VERSION missing): ${PGDATA_DIR}/PG_VERSION" - echo "Waiting for main job to extract cache..." 
- echo "Waiting for completion marker: ${PGDATA_DIR}/PG_VERSION" - WAIT_TIMEOUT=600 - ELAPSED=0 - while [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; do - if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then - echo "ERROR: Timeout waiting for cache extraction" - echo "Main job should have extracted NFS cache to: ${ORIGINAL_SOURCE}" - ls -la "${ORIGINAL_SOURCE}/" 2>&1 | head -10 || echo "Cannot list cache dir" - exit 1 - fi - sleep 5 - ELAPSED=$((ELAPSED + 5)) - echo "Waiting for cache extraction... (${ELAPSED}s/${WAIT_TIMEOUT}s)" - done - echo "Cache ready after ${ELAPSED}s" - # Ensure PostgreSQL data permissions are correct after main job extraction - if [[ -d "${ORIGINAL_SOURCE}/datadir/haf_db_store" ]]; then - echo "Fixing PostgreSQL data permissions..." - sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_db_store" - sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" - fi - else - echo "Cache complete, using local cache: ${ORIGINAL_SOURCE}" - # Ensure PostgreSQL data permissions are correct - if [[ -d "${ORIGINAL_SOURCE}/datadir/haf_db_store" ]]; then - echo "Fixing PostgreSQL data permissions..." - sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_db_store" - sudo chown -R 105:109 "${ORIGINAL_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${ORIGINAL_SOURCE}/datadir/haf_db_store/pgdata" + # Wait for PG_VERSION to appear (main job handles extraction) + WAIT_TIMEOUT=600 + ELAPSED=0 + while [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; do + if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then + echo "ERROR: Timeout waiting for cache (${WAIT_TIMEOUT}s)" + ls -la "${DATA_SOURCE}/" 2>&1 || true + exit 1 fi - fi + sleep 5 + ELAPSED=$((ELAPSED + 5)) + done + echo "Cache ready after ${ELAPSED}s, starting PostgreSQL..." + + # Fix permissions and start PostgreSQL + sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_db_store" 2>/dev/null || true + sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${PGDATA_DIR}" 2>/dev/null || true - # Run original entrypoint exec /home/haf_admin/docker_entrypoint.sh "$@" - '/bin/bash' command: ["--execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] -- GitLab From cb10ef640e59f29ad7f8fa92d727accac5df0ea6 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 09:17:29 -0500 Subject: [PATCH 067/108] Fix service container: extract from NFS directly instead of waiting for main job Service containers start BEFORE main job's before_script, so waiting for main job to extract cache causes a deadlock - GitLab health check times out waiting for PostgreSQL before main job can even start. Solution: Service container now extracts from NFS directly using a lock file mechanism to prevent race conditions with other processes. --- .gitlab-ci.yml | 126 +++++++++++++++++++++---------------------------- 1 file changed, 54 insertions(+), 72 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8f3d61b5..59f88dff 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -643,23 +643,42 @@ sync: - '-c' - | set -euo pipefail - # Wait for main job to extract cache, then start PostgreSQL PGDATA_DIR="${DATA_SOURCE}/datadir/haf_db_store/pgdata" - echo "Service container waiting for cache: ${PGDATA_DIR}/PG_VERSION" - - # Wait for PG_VERSION to appear (main job handles extraction) - WAIT_TIMEOUT=600 - ELAPSED=0 - while [[ ! 
-f "${PGDATA_DIR}/PG_VERSION" ]]; do - if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then - echo "ERROR: Timeout waiting for cache (${WAIT_TIMEOUT}s)" - ls -la "${DATA_SOURCE}/" 2>&1 || true - exit 1 + NFS_TAR="${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar" + LOCK_FILE="${DATA_SOURCE}.extracting" + echo "Service: checking cache at ${PGDATA_DIR}" + + # Check if cache is already complete + if [[ -f "${PGDATA_DIR}/PG_VERSION" ]]; then + echo "Service: local cache ready" + elif [[ -f "$NFS_TAR" ]]; then + echo "Service: extracting from NFS: $NFS_TAR" + # Use lock file to prevent race conditions with main job + if sudo mkdir "$LOCK_FILE" 2>/dev/null; then + # We got the lock - extract the cache + echo "Service: acquired lock, extracting..." + sudo mkdir -p "${DATA_SOURCE}" + sudo tar xf "$NFS_TAR" -C "${DATA_SOURCE}" + sudo rmdir "$LOCK_FILE" + echo "Service: extraction complete" + else + # Another process is extracting - wait for it + echo "Service: waiting for extraction to complete..." + WAIT_TIMEOUT=300 + ELAPSED=0 + while [[ -d "$LOCK_FILE" ]] && [[ $ELAPSED -lt $WAIT_TIMEOUT ]]; do + sleep 5 + ELAPSED=$((ELAPSED + 5)) + done + if [[ -d "$LOCK_FILE" ]]; then + echo "Service: lock timeout, cleaning stale lock" + sudo rmdir "$LOCK_FILE" 2>/dev/null || true + fi fi - sleep 5 - ELAPSED=$((ELAPSED + 5)) - done - echo "Cache ready after ${ELAPSED}s, starting PostgreSQL..." + else + echo "Service: ERROR - no local cache and no NFS tar at $NFS_TAR" + exit 1 + fi # Fix permissions and start PostgreSQL sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_db_store" 2>/dev/null || true @@ -685,71 +704,34 @@ sync: HEALTHCHECK_TCP_PORT: 3000 # Extension that extracts NFS cache before waiting for PostgreSQL -# Service container waits for this extraction to complete +# Service container also extracts, uses lock file to prevent race conditions .wait-for-haf-postgres-with-nfs: extends: .wait-for-haf-postgres before_script: - | - # Extract NFS cache to local /cache/ so service container can find it - # Check for pgdata (not just datadir) to ensure extraction is complete + # Service container should have extracted, but check/fix permissions if needed LOCAL_CACHE="${DATA_CACHE_HAF_PREFIX}_${HAFBE_CACHE_KEY}" - NFS_TAR="${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/${HAFBE_CACHE_KEY}.tar" PGDATA_DIR="${LOCAL_CACHE}/datadir/haf_db_store/pgdata" - echo "Checking cache: LOCAL_CACHE=${LOCAL_CACHE}, NFS_TAR=${NFS_TAR}" - # Check for complete extraction (pgdata/PG_VERSION as completion marker) + LOCK_FILE="${LOCAL_CACHE}.extracting" + echo "Main job: checking cache at ${PGDATA_DIR}" + + # Wait for any extraction in progress (lock file present) + if [[ -d "$LOCK_FILE" ]]; then + echo "Main job: extraction in progress, waiting..." + WAIT_TIMEOUT=300 + ELAPSED=0 + while [[ -d "$LOCK_FILE" ]] && [[ $ELAPSED -lt $WAIT_TIMEOUT ]]; do + sleep 5 + ELAPSED=$((ELAPSED + 5)) + done + fi + + # Check if cache is ready if [[ -f "${PGDATA_DIR}/PG_VERSION" ]]; then - echo "Local cache ready: ${LOCAL_CACHE}" - elif [[ -f "$NFS_TAR" ]]; then - echo "Cache not ready (pgdata/PG_VERSION missing), will extract from NFS..." - # If partial extraction exists, try to remove it (service container may be extracting) - if [[ -d "${LOCAL_CACHE}/datadir" ]]; then - echo "Partial extraction found, attempting to remove..." - # Try to remove - if it fails (service container has files open), wait for service - if ! 
sudo rm -rf "${LOCAL_CACHE}/datadir" 2>/dev/null; then - echo "Could not remove (service container extracting), waiting for completion..." - WAIT_TIMEOUT=600 - ELAPSED=0 - while [[ ! -f "${PGDATA_DIR}/PG_VERSION" ]]; do - if [[ $ELAPSED -ge $WAIT_TIMEOUT ]]; then - echo "ERROR: Timeout waiting for service container extraction" - ls -la "${LOCAL_CACHE}/" 2>&1 | head -10 || true - exit 1 - fi - sleep 10 - ELAPSED=$((ELAPSED + 10)) - echo "Waiting for service extraction... (${ELAPSED}s/${WAIT_TIMEOUT}s)" - done - echo "Service extraction completed after ${ELAPSED}s" - else - echo "Removed partial extraction, will extract fresh..." - echo "Extracting NFS cache for service container..." - sudo mkdir -p "${LOCAL_CACHE}" - sudo tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" - # Restore pgdata ownership and permissions for PostgreSQL - if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then - echo "Fixing PostgreSQL data permissions..." - sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" - sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" - fi - echo "NFS cache extracted to: ${LOCAL_CACHE}" - fi - else - echo "No existing extraction, extracting fresh from NFS..." - sudo mkdir -p "${LOCAL_CACHE}" - sudo tar xf "$NFS_TAR" -C "${LOCAL_CACHE}" - # Restore pgdata ownership and permissions for PostgreSQL - if [[ -d "${LOCAL_CACHE}/datadir/haf_db_store" ]]; then - echo "Fixing PostgreSQL data permissions..." - sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_db_store" - sudo chown -R 105:109 "${LOCAL_CACHE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${LOCAL_CACHE}/datadir/haf_db_store/pgdata" - fi - echo "NFS cache extracted to: ${LOCAL_CACHE}" - fi + echo "Main job: cache ready" else - echo "WARNING: NFS cache not found: $NFS_TAR" - ls -la "${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/" 2>&1 | head -5 || echo "Cannot list NFS directory" + echo "Main job: cache not ready, service container should have extracted" + ls -la "${LOCAL_CACHE}/" 2>&1 | head -10 || echo "Cannot list cache directory" fi - !reference [.wait-for-haf-postgres, before_script] -- GitLab From 59a79e6a58319dd5ee18d182655d9b477961e856 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 09:39:23 -0500 Subject: [PATCH 068/108] Fix service container entrypoint format for proper execution The multi-line YAML entrypoint format with an extra '/bin/bash' at the end was causing the script to not execute properly. Convert to single-line JSON array format which is more reliable for GitLab CI service entrypoints. 
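Background on that trailing '/bin/bash': with bash -c, the first argument after the command string becomes $0 inside the script, and any further arguments become the positional parameters that "$@" forwards. A standalone illustration:

    # The extra '/bin/bash' fills $0; the service's command entries arrive as "$@".
    /bin/bash -c 'echo "name=$0 args=$*"' /bin/bash --execute-maintenance-script=sleep_infinity.sh
    # prints: name=/bin/bash args=--execute-maintenance-script=sleep_infinity.sh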
--- .gitlab-ci.yml | 50 +------------------------------------------------- 1 file changed, 1 insertion(+), 49 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 59f88dff..8435d683 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -638,55 +638,7 @@ sync: DATA_SOURCE_NFS_PREFIX: "${DATA_CACHE_NFS_PREFIX}" DATA_SOURCE_NFS_TYPE: "${HAFBE_SYNC_CACHE_TYPE}" DATA_SOURCE_NFS_KEY: "${HAFBE_CACHE_KEY}" - entrypoint: - - '/bin/bash' - - '-c' - - | - set -euo pipefail - PGDATA_DIR="${DATA_SOURCE}/datadir/haf_db_store/pgdata" - NFS_TAR="${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar" - LOCK_FILE="${DATA_SOURCE}.extracting" - echo "Service: checking cache at ${PGDATA_DIR}" - - # Check if cache is already complete - if [[ -f "${PGDATA_DIR}/PG_VERSION" ]]; then - echo "Service: local cache ready" - elif [[ -f "$NFS_TAR" ]]; then - echo "Service: extracting from NFS: $NFS_TAR" - # Use lock file to prevent race conditions with main job - if sudo mkdir "$LOCK_FILE" 2>/dev/null; then - # We got the lock - extract the cache - echo "Service: acquired lock, extracting..." - sudo mkdir -p "${DATA_SOURCE}" - sudo tar xf "$NFS_TAR" -C "${DATA_SOURCE}" - sudo rmdir "$LOCK_FILE" - echo "Service: extraction complete" - else - # Another process is extracting - wait for it - echo "Service: waiting for extraction to complete..." - WAIT_TIMEOUT=300 - ELAPSED=0 - while [[ -d "$LOCK_FILE" ]] && [[ $ELAPSED -lt $WAIT_TIMEOUT ]]; do - sleep 5 - ELAPSED=$((ELAPSED + 5)) - done - if [[ -d "$LOCK_FILE" ]]; then - echo "Service: lock timeout, cleaning stale lock" - sudo rmdir "$LOCK_FILE" 2>/dev/null || true - fi - fi - else - echo "Service: ERROR - no local cache and no NFS tar at $NFS_TAR" - exit 1 - fi - - # Fix permissions and start PostgreSQL - sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_db_store" 2>/dev/null || true - sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${PGDATA_DIR}" 2>/dev/null || true - - exec /home/haf_admin/docker_entrypoint.sh "$@" - - '/bin/bash' + entrypoint: ["/bin/bash", "-c", "set -euo pipefail; PGDATA_DIR=\"${DATA_SOURCE}/datadir/haf_db_store/pgdata\"; NFS_TAR=\"${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar\"; LOCK_FILE=\"${DATA_SOURCE}.extracting\"; echo \"Service: checking cache at ${PGDATA_DIR}\"; if [[ -f \"${PGDATA_DIR}/PG_VERSION\" ]]; then echo \"Service: local cache ready\"; elif [[ -f \"$NFS_TAR\" ]]; then echo \"Service: extracting from NFS: $NFS_TAR\"; if sudo mkdir \"$LOCK_FILE\" 2>/dev/null; then echo \"Service: acquired lock, extracting...\"; sudo mkdir -p \"${DATA_SOURCE}\"; sudo tar xf \"$NFS_TAR\" -C \"${DATA_SOURCE}\"; sudo rmdir \"$LOCK_FILE\"; echo \"Service: extraction complete\"; else echo \"Service: waiting for extraction to complete...\"; WAIT_TIMEOUT=300; ELAPSED=0; while [[ -d \"$LOCK_FILE\" ]] && [[ $ELAPSED -lt $WAIT_TIMEOUT ]]; do sleep 5; ELAPSED=$((ELAPSED + 5)); done; if [[ -d \"$LOCK_FILE\" ]]; then echo \"Service: lock timeout, cleaning stale lock\"; sudo rmdir \"$LOCK_FILE\" 2>/dev/null || true; fi; fi; else echo \"Service: ERROR - no local cache and no NFS tar at $NFS_TAR\"; exit 1; fi; sudo chown -R 105:109 \"${DATA_SOURCE}/datadir/haf_db_store\" 2>/dev/null || true; sudo chown -R 105:109 \"${DATA_SOURCE}/datadir/haf_postgresql_conf.d\" 2>/dev/null || true; sudo chmod 700 \"${PGDATA_DIR}\" 2>/dev/null || true; exec /home/haf_admin/docker_entrypoint.sh \"$@\"", "/bin/bash"] command: 
["--execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] .postgrest-service: &postgrest-service -- GitLab From 5335027caf0f7e5aaaf3aa35b2dc62ae09329416 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 09:59:24 -0500 Subject: [PATCH 069/108] Simplify service container entrypoint: embed command in script Use cleaner multi-line YAML entrypoint format and embed the maintenance script command directly in the entrypoint rather than using separate command. This ensures the entire script runs as intended. --- .gitlab-ci.yml | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8435d683..dd313fdf 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -638,8 +638,46 @@ sync: DATA_SOURCE_NFS_PREFIX: "${DATA_CACHE_NFS_PREFIX}" DATA_SOURCE_NFS_TYPE: "${HAFBE_SYNC_CACHE_TYPE}" DATA_SOURCE_NFS_KEY: "${HAFBE_CACHE_KEY}" - entrypoint: ["/bin/bash", "-c", "set -euo pipefail; PGDATA_DIR=\"${DATA_SOURCE}/datadir/haf_db_store/pgdata\"; NFS_TAR=\"${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar\"; LOCK_FILE=\"${DATA_SOURCE}.extracting\"; echo \"Service: checking cache at ${PGDATA_DIR}\"; if [[ -f \"${PGDATA_DIR}/PG_VERSION\" ]]; then echo \"Service: local cache ready\"; elif [[ -f \"$NFS_TAR\" ]]; then echo \"Service: extracting from NFS: $NFS_TAR\"; if sudo mkdir \"$LOCK_FILE\" 2>/dev/null; then echo \"Service: acquired lock, extracting...\"; sudo mkdir -p \"${DATA_SOURCE}\"; sudo tar xf \"$NFS_TAR\" -C \"${DATA_SOURCE}\"; sudo rmdir \"$LOCK_FILE\"; echo \"Service: extraction complete\"; else echo \"Service: waiting for extraction to complete...\"; WAIT_TIMEOUT=300; ELAPSED=0; while [[ -d \"$LOCK_FILE\" ]] && [[ $ELAPSED -lt $WAIT_TIMEOUT ]]; do sleep 5; ELAPSED=$((ELAPSED + 5)); done; if [[ -d \"$LOCK_FILE\" ]]; then echo \"Service: lock timeout, cleaning stale lock\"; sudo rmdir \"$LOCK_FILE\" 2>/dev/null || true; fi; fi; else echo \"Service: ERROR - no local cache and no NFS tar at $NFS_TAR\"; exit 1; fi; sudo chown -R 105:109 \"${DATA_SOURCE}/datadir/haf_db_store\" 2>/dev/null || true; sudo chown -R 105:109 \"${DATA_SOURCE}/datadir/haf_postgresql_conf.d\" 2>/dev/null || true; sudo chmod 700 \"${PGDATA_DIR}\" 2>/dev/null || true; exec /home/haf_admin/docker_entrypoint.sh \"$@\"", "/bin/bash"] - command: ["--execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] + entrypoint: + - /bin/bash + - -c + - | + set -euo pipefail + PGDATA_DIR="${DATA_SOURCE}/datadir/haf_db_store/pgdata" + NFS_TAR="${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar" + LOCK_FILE="${DATA_SOURCE}.extracting" + echo "Service: checking cache at ${PGDATA_DIR}" + if [[ -f "${PGDATA_DIR}/PG_VERSION" ]]; then + echo "Service: local cache ready" + elif [[ -f "$NFS_TAR" ]]; then + echo "Service: extracting from NFS: $NFS_TAR" + if sudo mkdir "$LOCK_FILE" 2>/dev/null; then + echo "Service: acquired lock, extracting..." + sudo mkdir -p "${DATA_SOURCE}" + sudo tar xf "$NFS_TAR" -C "${DATA_SOURCE}" + sudo rmdir "$LOCK_FILE" + echo "Service: extraction complete" + else + echo "Service: waiting for extraction to complete..." 
+ WAIT_TIMEOUT=300 + ELAPSED=0 + while [[ -d "$LOCK_FILE" ]] && [[ $ELAPSED -lt $WAIT_TIMEOUT ]]; do + sleep 5 + ELAPSED=$((ELAPSED + 5)) + done + if [[ -d "$LOCK_FILE" ]]; then + echo "Service: lock timeout, cleaning stale lock" + sudo rmdir "$LOCK_FILE" 2>/dev/null || true + fi + fi + else + echo "Service: ERROR - no local cache and no NFS tar at $NFS_TAR" + exit 1 + fi + sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_db_store" 2>/dev/null || true + sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "${PGDATA_DIR}" 2>/dev/null || true + exec /home/haf_admin/docker_entrypoint.sh --execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh .postgrest-service: &postgrest-service name: registry.gitlab.syncad.com/hive/common-ci-configuration/postgrest:v12.0.2 -- GitLab From 4517ce949b98b3a63c00664fd68427167a6d142c Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 10:24:15 -0500 Subject: [PATCH 070/108] Fix service entrypoint: use explicit JSON array format GitLab CI service entrypoint override requires explicit JSON array format. Multi-line YAML with | was not being applied correctly. Also clear command with empty array to prevent image's default CMD from interfering. --- .gitlab-ci.yml | 42 ++---------------------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index dd313fdf..6016fd9f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -638,46 +638,8 @@ sync: DATA_SOURCE_NFS_PREFIX: "${DATA_CACHE_NFS_PREFIX}" DATA_SOURCE_NFS_TYPE: "${HAFBE_SYNC_CACHE_TYPE}" DATA_SOURCE_NFS_KEY: "${HAFBE_CACHE_KEY}" - entrypoint: - - /bin/bash - - -c - - | - set -euo pipefail - PGDATA_DIR="${DATA_SOURCE}/datadir/haf_db_store/pgdata" - NFS_TAR="${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar" - LOCK_FILE="${DATA_SOURCE}.extracting" - echo "Service: checking cache at ${PGDATA_DIR}" - if [[ -f "${PGDATA_DIR}/PG_VERSION" ]]; then - echo "Service: local cache ready" - elif [[ -f "$NFS_TAR" ]]; then - echo "Service: extracting from NFS: $NFS_TAR" - if sudo mkdir "$LOCK_FILE" 2>/dev/null; then - echo "Service: acquired lock, extracting..." - sudo mkdir -p "${DATA_SOURCE}" - sudo tar xf "$NFS_TAR" -C "${DATA_SOURCE}" - sudo rmdir "$LOCK_FILE" - echo "Service: extraction complete" - else - echo "Service: waiting for extraction to complete..." 
- WAIT_TIMEOUT=300 - ELAPSED=0 - while [[ -d "$LOCK_FILE" ]] && [[ $ELAPSED -lt $WAIT_TIMEOUT ]]; do - sleep 5 - ELAPSED=$((ELAPSED + 5)) - done - if [[ -d "$LOCK_FILE" ]]; then - echo "Service: lock timeout, cleaning stale lock" - sudo rmdir "$LOCK_FILE" 2>/dev/null || true - fi - fi - else - echo "Service: ERROR - no local cache and no NFS tar at $NFS_TAR" - exit 1 - fi - sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_db_store" 2>/dev/null || true - sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "${PGDATA_DIR}" 2>/dev/null || true - exec /home/haf_admin/docker_entrypoint.sh --execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh + entrypoint: ["/bin/bash", "-c", "set -euo pipefail; PGDATA_DIR=${DATA_SOURCE}/datadir/haf_db_store/pgdata; NFS_TAR=${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar; LOCK_FILE=${DATA_SOURCE}.extracting; echo Service: checking cache at $PGDATA_DIR; if [ -f $PGDATA_DIR/PG_VERSION ]; then echo Service: local cache ready; elif [ -f $NFS_TAR ]; then echo Service: extracting from NFS $NFS_TAR; if sudo mkdir $LOCK_FILE 2>/dev/null; then echo Service: acquired lock extracting; sudo mkdir -p ${DATA_SOURCE}; sudo tar xf $NFS_TAR -C ${DATA_SOURCE}; sudo rmdir $LOCK_FILE; echo Service: extraction complete; else echo Service: waiting for extraction; WAIT_TIMEOUT=300; ELAPSED=0; while [ -d $LOCK_FILE ] && [ $ELAPSED -lt $WAIT_TIMEOUT ]; do sleep 5; ELAPSED=$((ELAPSED + 5)); done; if [ -d $LOCK_FILE ]; then echo Service: lock timeout; sudo rmdir $LOCK_FILE 2>/dev/null || true; fi; fi; else echo Service: ERROR no local cache and no NFS tar; exit 1; fi; sudo chown -R 105:109 ${DATA_SOURCE}/datadir/haf_db_store 2>/dev/null || true; sudo chown -R 105:109 ${DATA_SOURCE}/datadir/haf_postgresql_conf.d 2>/dev/null || true; sudo chmod 700 $PGDATA_DIR 2>/dev/null || true; exec /home/haf_admin/docker_entrypoint.sh --execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] + command: [""] .postgrest-service: &postgrest-service name: registry.gitlab.syncad.com/hive/common-ci-configuration/postgrest:v12.0.2 -- GitLab From ffabed4f778ec4a9e26a3b2423f26c929be549fe Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 10:45:21 -0500 Subject: [PATCH 071/108] Fix service entrypoint: clear entrypoint and use command instead GitLab CI service entrypoint override may not work with complex scripts. Try clearing entrypoint with empty array and putting script in command instead, which should allow the container to run our custom script. 
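For context, a rough sketch of the Docker-level behaviour being targeted, expressed as plain docker run invocations (the image name is illustrative; the exact flags the GitLab runner generates for services may differ):

    # entrypoint: ["/bin/bash", "-c", "<script>"]  roughly corresponds to:
    docker run --entrypoint /bin/bash haf-image -c '<script>'

    # entrypoint: [""]  +  command: ["/bin/bash", "-c", "<script>"]  roughly corresponds to:
    docker run --entrypoint '' haf-image /bin/bash -c '<script>'

In plain Docker both forms yield the same effective argv; the open question these patches probe is how the runner actually maps a service's entrypoint/command overrides onto the container.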
--- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6016fd9f..78ed8bd4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -638,8 +638,8 @@ sync: DATA_SOURCE_NFS_PREFIX: "${DATA_CACHE_NFS_PREFIX}" DATA_SOURCE_NFS_TYPE: "${HAFBE_SYNC_CACHE_TYPE}" DATA_SOURCE_NFS_KEY: "${HAFBE_CACHE_KEY}" - entrypoint: ["/bin/bash", "-c", "set -euo pipefail; PGDATA_DIR=${DATA_SOURCE}/datadir/haf_db_store/pgdata; NFS_TAR=${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar; LOCK_FILE=${DATA_SOURCE}.extracting; echo Service: checking cache at $PGDATA_DIR; if [ -f $PGDATA_DIR/PG_VERSION ]; then echo Service: local cache ready; elif [ -f $NFS_TAR ]; then echo Service: extracting from NFS $NFS_TAR; if sudo mkdir $LOCK_FILE 2>/dev/null; then echo Service: acquired lock extracting; sudo mkdir -p ${DATA_SOURCE}; sudo tar xf $NFS_TAR -C ${DATA_SOURCE}; sudo rmdir $LOCK_FILE; echo Service: extraction complete; else echo Service: waiting for extraction; WAIT_TIMEOUT=300; ELAPSED=0; while [ -d $LOCK_FILE ] && [ $ELAPSED -lt $WAIT_TIMEOUT ]; do sleep 5; ELAPSED=$((ELAPSED + 5)); done; if [ -d $LOCK_FILE ]; then echo Service: lock timeout; sudo rmdir $LOCK_FILE 2>/dev/null || true; fi; fi; else echo Service: ERROR no local cache and no NFS tar; exit 1; fi; sudo chown -R 105:109 ${DATA_SOURCE}/datadir/haf_db_store 2>/dev/null || true; sudo chown -R 105:109 ${DATA_SOURCE}/datadir/haf_postgresql_conf.d 2>/dev/null || true; sudo chmod 700 $PGDATA_DIR 2>/dev/null || true; exec /home/haf_admin/docker_entrypoint.sh --execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] - command: [""] + entrypoint: [""] + command: ["/bin/bash", "-c", "set -euo pipefail; PGDATA_DIR=${DATA_SOURCE}/datadir/haf_db_store/pgdata; NFS_TAR=${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar; LOCK_FILE=${DATA_SOURCE}.extracting; echo Service: checking cache at $PGDATA_DIR; if [ -f $PGDATA_DIR/PG_VERSION ]; then echo Service: local cache ready; elif [ -f $NFS_TAR ]; then echo Service: extracting from NFS $NFS_TAR; if sudo mkdir $LOCK_FILE 2>/dev/null; then echo Service: acquired lock extracting; sudo mkdir -p ${DATA_SOURCE}; sudo tar xf $NFS_TAR -C ${DATA_SOURCE}; sudo rmdir $LOCK_FILE; echo Service: extraction complete; else echo Service: waiting for extraction; WAIT_TIMEOUT=300; ELAPSED=0; while [ -d $LOCK_FILE ] && [ $ELAPSED -lt $WAIT_TIMEOUT ]; do sleep 5; ELAPSED=$((ELAPSED + 5)); done; if [ -d $LOCK_FILE ]; then echo Service: lock timeout; sudo rmdir $LOCK_FILE 2>/dev/null || true; fi; fi; else echo Service: ERROR no local cache and no NFS tar; exit 1; fi; sudo chown -R 105:109 ${DATA_SOURCE}/datadir/haf_db_store 2>/dev/null || true; sudo chown -R 105:109 ${DATA_SOURCE}/datadir/haf_postgresql_conf.d 2>/dev/null || true; sudo chmod 700 $PGDATA_DIR 2>/dev/null || true; exec /home/haf_admin/docker_entrypoint.sh --execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] .postgrest-service: &postgrest-service name: registry.gitlab.syncad.com/hive/common-ci-configuration/postgrest:v12.0.2 -- GitLab From 10dc07c34ce39e8abebfb14d0cc98d22892056a2 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 11:06:13 -0500 Subject: [PATCH 072/108] Fix service entrypoint: use bash + command with multi-line script Use explicit /bin/bash entrypoint with command containing -c and multi-line script. 
This is the standard Docker pattern for running custom scripts and should properly override the HAF image's entrypoint. --- .gitlab-ci.yml | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 78ed8bd4..6dde1ef8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -638,8 +638,47 @@ sync: DATA_SOURCE_NFS_PREFIX: "${DATA_CACHE_NFS_PREFIX}" DATA_SOURCE_NFS_TYPE: "${HAFBE_SYNC_CACHE_TYPE}" DATA_SOURCE_NFS_KEY: "${HAFBE_CACHE_KEY}" - entrypoint: [""] - command: ["/bin/bash", "-c", "set -euo pipefail; PGDATA_DIR=${DATA_SOURCE}/datadir/haf_db_store/pgdata; NFS_TAR=${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar; LOCK_FILE=${DATA_SOURCE}.extracting; echo Service: checking cache at $PGDATA_DIR; if [ -f $PGDATA_DIR/PG_VERSION ]; then echo Service: local cache ready; elif [ -f $NFS_TAR ]; then echo Service: extracting from NFS $NFS_TAR; if sudo mkdir $LOCK_FILE 2>/dev/null; then echo Service: acquired lock extracting; sudo mkdir -p ${DATA_SOURCE}; sudo tar xf $NFS_TAR -C ${DATA_SOURCE}; sudo rmdir $LOCK_FILE; echo Service: extraction complete; else echo Service: waiting for extraction; WAIT_TIMEOUT=300; ELAPSED=0; while [ -d $LOCK_FILE ] && [ $ELAPSED -lt $WAIT_TIMEOUT ]; do sleep 5; ELAPSED=$((ELAPSED + 5)); done; if [ -d $LOCK_FILE ]; then echo Service: lock timeout; sudo rmdir $LOCK_FILE 2>/dev/null || true; fi; fi; else echo Service: ERROR no local cache and no NFS tar; exit 1; fi; sudo chown -R 105:109 ${DATA_SOURCE}/datadir/haf_db_store 2>/dev/null || true; sudo chown -R 105:109 ${DATA_SOURCE}/datadir/haf_postgresql_conf.d 2>/dev/null || true; sudo chmod 700 $PGDATA_DIR 2>/dev/null || true; exec /home/haf_admin/docker_entrypoint.sh --execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh"] + entrypoint: + - /bin/bash + command: + - -c + - | + set -euo pipefail + PGDATA_DIR="${DATA_SOURCE}/datadir/haf_db_store/pgdata" + NFS_TAR="${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar" + LOCK_FILE="${DATA_SOURCE}.extracting" + echo "Service: checking cache at $PGDATA_DIR" + if [ -f "$PGDATA_DIR/PG_VERSION" ]; then + echo "Service: local cache ready" + elif [ -f "$NFS_TAR" ]; then + echo "Service: extracting from NFS $NFS_TAR" + if sudo mkdir "$LOCK_FILE" 2>/dev/null; then + echo "Service: acquired lock, extracting..." + sudo mkdir -p "${DATA_SOURCE}" + sudo tar xf "$NFS_TAR" -C "${DATA_SOURCE}" + sudo rmdir "$LOCK_FILE" + echo "Service: extraction complete" + else + echo "Service: waiting for extraction..." 
+ WAIT_TIMEOUT=300 + ELAPSED=0 + while [ -d "$LOCK_FILE" ] && [ $ELAPSED -lt $WAIT_TIMEOUT ]; do + sleep 5 + ELAPSED=$((ELAPSED + 5)) + done + if [ -d "$LOCK_FILE" ]; then + echo "Service: lock timeout" + sudo rmdir "$LOCK_FILE" 2>/dev/null || true + fi + fi + else + echo "Service: ERROR no local cache and no NFS tar at $NFS_TAR" + exit 1 + fi + sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_db_store" 2>/dev/null || true + sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true + sudo chmod 700 "$PGDATA_DIR" 2>/dev/null || true + exec /home/haf_admin/docker_entrypoint.sh --execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh .postgrest-service: &postgrest-service name: registry.gitlab.syncad.com/hive/common-ci-configuration/postgrest:v12.0.2 -- GitLab From 40fdc85f9199398fae140d7cb498dad64533f2cf Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 11:33:42 -0500 Subject: [PATCH 073/108] Use HAF built-in NFS fallback instead of service entrypoint override Service entrypoint override doesn't work reliably in GitLab CI. HAF's copy_datadir.sh already supports NFS cache extraction: - Parses DATA_SOURCE path to derive cache type and key - Looks for tar at /nfs/ci-cache/{type}/{key}.tar - Extracts and sets permissions automatically Changed DATA_SOURCE format from: /cache/replay_data_haf_{key} to: /cache/hafbe_sync_{key} This matches the NFS tar path structure and allows HAF to extract the cache automatically during startup. --- .gitlab-ci.yml | 88 +++++++++++--------------------------------------- 1 file changed, 18 insertions(+), 70 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6dde1ef8..3a9000d9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -623,6 +623,9 @@ sync: - data-cache-storage - fast +# HAF service container with NFS fallback +# Uses HAF's built-in NFS cache extraction (in copy_datadir.sh) +# DATA_SOURCE format: /cache/{type}_{key} -> looks for /nfs/ci-cache/{type}/{key}.tar .haf-instance-with-nfs-fallback: &haf-instance-with-nfs-fallback name: ${HAF_IMAGE_NAME} alias: haf-instance @@ -634,51 +637,11 @@ sync: "host all hafbe_user 0.0.0.0/0 trust" "host all hafbe_owner 0.0.0.0/0 trust" "host all all 0.0.0.0/0 scram-sha-256" - DATA_SOURCE: "${DATA_CACHE_HAF_PREFIX}_${HAFBE_CACHE_KEY}" - DATA_SOURCE_NFS_PREFIX: "${DATA_CACHE_NFS_PREFIX}" - DATA_SOURCE_NFS_TYPE: "${HAFBE_SYNC_CACHE_TYPE}" - DATA_SOURCE_NFS_KEY: "${HAFBE_CACHE_KEY}" - entrypoint: - - /bin/bash - command: - - -c - - | - set -euo pipefail - PGDATA_DIR="${DATA_SOURCE}/datadir/haf_db_store/pgdata" - NFS_TAR="${DATA_SOURCE_NFS_PREFIX}/${DATA_SOURCE_NFS_TYPE}/${DATA_SOURCE_NFS_KEY}.tar" - LOCK_FILE="${DATA_SOURCE}.extracting" - echo "Service: checking cache at $PGDATA_DIR" - if [ -f "$PGDATA_DIR/PG_VERSION" ]; then - echo "Service: local cache ready" - elif [ -f "$NFS_TAR" ]; then - echo "Service: extracting from NFS $NFS_TAR" - if sudo mkdir "$LOCK_FILE" 2>/dev/null; then - echo "Service: acquired lock, extracting..." - sudo mkdir -p "${DATA_SOURCE}" - sudo tar xf "$NFS_TAR" -C "${DATA_SOURCE}" - sudo rmdir "$LOCK_FILE" - echo "Service: extraction complete" - else - echo "Service: waiting for extraction..." 
- WAIT_TIMEOUT=300 - ELAPSED=0 - while [ -d "$LOCK_FILE" ] && [ $ELAPSED -lt $WAIT_TIMEOUT ]; do - sleep 5 - ELAPSED=$((ELAPSED + 5)) - done - if [ -d "$LOCK_FILE" ]; then - echo "Service: lock timeout" - sudo rmdir "$LOCK_FILE" 2>/dev/null || true - fi - fi - else - echo "Service: ERROR no local cache and no NFS tar at $NFS_TAR" - exit 1 - fi - sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_db_store" 2>/dev/null || true - sudo chown -R 105:109 "${DATA_SOURCE}/datadir/haf_postgresql_conf.d" 2>/dev/null || true - sudo chmod 700 "$PGDATA_DIR" 2>/dev/null || true - exec /home/haf_admin/docker_entrypoint.sh --execute-maintenance-script=${HAF_SOURCE_DIR}/scripts/maintenance-scripts/sleep_infinity.sh + # DATA_SOURCE format for HAF NFS fallback: /cache/{type}_{key} + # HAF parses this to find NFS tar at /nfs/ci-cache/{type}/{key}.tar + DATA_SOURCE: "/cache/${HAFBE_SYNC_CACHE_TYPE}_${HAFBE_CACHE_KEY}" + # NFS path where cache-manager.sh looks for tars + CACHE_NFS_PATH: "${DATA_CACHE_NFS_PREFIX}" .postgrest-service: &postgrest-service name: registry.gitlab.syncad.com/hive/common-ci-configuration/postgrest:v12.0.2 @@ -694,36 +657,21 @@ sync: PGRST_DB_EXTRA_SEARCH_PATH: hafbe_bal, reptracker_app HEALTHCHECK_TCP_PORT: 3000 -# Extension that extracts NFS cache before waiting for PostgreSQL -# Service container also extracts, uses lock file to prevent race conditions +# Extension for jobs using HAF service with NFS fallback +# HAF entrypoint handles NFS extraction automatically via copy_datadir.sh .wait-for-haf-postgres-with-nfs: extends: .wait-for-haf-postgres before_script: - | - # Service container should have extracted, but check/fix permissions if needed - LOCAL_CACHE="${DATA_CACHE_HAF_PREFIX}_${HAFBE_CACHE_KEY}" + # Log cache path info for debugging + LOCAL_CACHE="/cache/${HAFBE_SYNC_CACHE_TYPE}_${HAFBE_CACHE_KEY}" PGDATA_DIR="${LOCAL_CACHE}/datadir/haf_db_store/pgdata" - LOCK_FILE="${LOCAL_CACHE}.extracting" - echo "Main job: checking cache at ${PGDATA_DIR}" - - # Wait for any extraction in progress (lock file present) - if [[ -d "$LOCK_FILE" ]]; then - echo "Main job: extraction in progress, waiting..." - WAIT_TIMEOUT=300 - ELAPSED=0 - while [[ -d "$LOCK_FILE" ]] && [[ $ELAPSED -lt $WAIT_TIMEOUT ]]; do - sleep 5 - ELAPSED=$((ELAPSED + 5)) - done - fi - - # Check if cache is ready - if [[ -f "${PGDATA_DIR}/PG_VERSION" ]]; then - echo "Main job: cache ready" - else - echo "Main job: cache not ready, service container should have extracted" - ls -la "${LOCAL_CACHE}/" 2>&1 | head -10 || echo "Cannot list cache directory" - fi + NFS_TAR="${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/${HAFBE_CACHE_KEY}.tar" + echo "Main job: cache info" + echo " LOCAL_CACHE: ${LOCAL_CACHE}" + echo " NFS_TAR: ${NFS_TAR}" + echo " PGDATA would be at: ${PGDATA_DIR}" + ls -la "${LOCAL_CACHE}/" 2>&1 | head -5 || echo " Cache dir not visible to main job (normal - service volume)" - !reference [.wait-for-haf-postgres, before_script] python_api_client_test: -- GitLab From 278c25b056ebefa05f86c2db9f7d8586c775ef43 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 11:52:47 -0500 Subject: [PATCH 074/108] Use direct variable references in service container config Service containers may not expand nested variable references properly. Changed from: DATA_SOURCE: /cache/${HAFBE_SYNC_CACHE_TYPE}_${HAFBE_CACHE_KEY} to: DATA_SOURCE: /cache/hafbe_sync_${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA} Also use literal path for CACHE_NFS_PATH instead of variable reference. Added NFS tar existence check in main job for debugging. 
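For reference, the concrete expansion this change relies on, with placeholder values (<haf-commit> and <short-sha> stand in for the real IDs):

    # Service-side variable:
    #   DATA_SOURCE=/cache/hafbe_sync_<haf-commit>_<short-sha>
    # Tar that HAF's copy_datadir.sh is expected to locate:
    #   /nfs/ci-cache/hafbe_sync/<haf-commit>_<short-sha>.tar
    # Existence probe (the same check added to the main job below):
    test -f "/nfs/ci-cache/hafbe_sync/${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}.tar" && echo yes || echo no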
--- .gitlab-ci.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3a9000d9..72598425 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -639,9 +639,10 @@ sync: "host all all 0.0.0.0/0 scram-sha-256" # DATA_SOURCE format for HAF NFS fallback: /cache/{type}_{key} # HAF parses this to find NFS tar at /nfs/ci-cache/{type}/{key}.tar - DATA_SOURCE: "/cache/${HAFBE_SYNC_CACHE_TYPE}_${HAFBE_CACHE_KEY}" + # Note: Must use HAF_COMMIT directly (not HAFBE_CACHE_KEY) for proper expansion in services + DATA_SOURCE: "/cache/hafbe_sync_${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}" # NFS path where cache-manager.sh looks for tars - CACHE_NFS_PATH: "${DATA_CACHE_NFS_PREFIX}" + CACHE_NFS_PATH: "/nfs/ci-cache" .postgrest-service: &postgrest-service name: registry.gitlab.syncad.com/hive/common-ci-configuration/postgrest:v12.0.2 @@ -664,14 +665,15 @@ sync: before_script: - | # Log cache path info for debugging - LOCAL_CACHE="/cache/${HAFBE_SYNC_CACHE_TYPE}_${HAFBE_CACHE_KEY}" + LOCAL_CACHE="/cache/hafbe_sync_${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}" PGDATA_DIR="${LOCAL_CACHE}/datadir/haf_db_store/pgdata" - NFS_TAR="${DATA_CACHE_NFS_PREFIX}/${HAFBE_SYNC_CACHE_TYPE}/${HAFBE_CACHE_KEY}.tar" + NFS_TAR="/nfs/ci-cache/hafbe_sync/${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}.tar" echo "Main job: cache info" echo " LOCAL_CACHE: ${LOCAL_CACHE}" echo " NFS_TAR: ${NFS_TAR}" echo " PGDATA would be at: ${PGDATA_DIR}" - ls -la "${LOCAL_CACHE}/" 2>&1 | head -5 || echo " Cache dir not visible to main job (normal - service volume)" + echo " NFS tar exists: $(test -f "$NFS_TAR" && echo 'yes' || echo 'no')" + ls -la "${LOCAL_CACHE}/" 2>&1 | head -5 || echo " Cache dir not visible to main job yet" - !reference [.wait-for-haf-postgres, before_script] python_api_client_test: -- GitLab From a78c81014a4c9c73e6f0ff9868b0fe2f165e211b Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 13:34:39 -0500 Subject: [PATCH 075/108] Migrate test jobs to docker-compose pattern (like balance_tracker) - Add common-ci-configuration haf_app_testing templates include - Update detect_changes to use .haf_app_detect_changes template - Create docker-compose-test.yml for lightweight test environment - Add extract-cache-and-wait.sh for explicit cache extraction - Add wait-for-postgrest.sh for service readiness checks - Refactor all test jobs to use docker-compose instead of service containers - Remove old service container definitions (.haf-instance-with-nfs-fallback, etc.) - Update cache type to haf_hafbe_sync (with haf_ prefix for pgdata handling) This aligns haf_block_explorer CI with the common HAF app testing pattern used by balance_tracker, providing better control over container lifecycle and cache extraction without race conditions. 
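In outline, each migrated test job now drives its environment like this (a sketch condensed from the template below; script paths and the haf/postgrest hostnames are the ones introduced in this patch):

    # 1. extract pre-synced HAF data from the NFS cache into the workspace
    SKIP_WAIT=true ./scripts/ci-helpers/extract-cache-and-wait.sh \
        "${HAFBE_SYNC_CACHE_TYPE}" "${HAFBE_CACHE_KEY}" "${HAF_DATA_DIRECTORY}"
    # 2. start HAF + PostgREST from the lightweight compose file
    docker compose -f docker/docker-compose-test.yml up -d
    # 3. block until PostgreSQL (haf:5432) and PostgREST (postgrest:3000) answer health checks
    ./scripts/ci-helpers/wait-for-postgrest.sh
    # 4. run the job's tests, then tear down and keep the logs
    docker compose -f docker/docker-compose-test.yml logs > docker/container-logs.txt
    docker compose -f docker/docker-compose-test.yml down -v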
--- .gitlab-ci.yml | 301 ++++++++----------- docker/docker-compose-test.yml | 97 ++++++ scripts/ci-helpers/extract-cache-and-wait.sh | 155 ++++++++++ scripts/ci-helpers/skip_rules.yml | 89 ++---- scripts/ci-helpers/wait-for-postgrest.sh | 78 +++++ 5 files changed, 470 insertions(+), 250 deletions(-) create mode 100644 docker/docker-compose-test.yml create mode 100755 scripts/ci-helpers/extract-cache-and-wait.sh create mode 100755 scripts/ci-helpers/wait-for-postgrest.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 72598425..46593537 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,5 @@ stages: +- detect - lint - build - sync @@ -25,7 +26,8 @@ variables: # NFS cache configuration for sync data sharing across builders DATA_CACHE_NFS_PREFIX: "/nfs/ci-cache" HAFBE_CACHE_KEY: "${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}" - HAFBE_SYNC_CACHE_TYPE: "hafbe_sync" + # Cache type prefixed with haf_ for automatic pgdata permission handling by cache-manager + HAFBE_SYNC_CACHE_TYPE: "haf_hafbe_sync" BLOCK_LOG_SOURCE_DIR_5M: /blockchain/block_log_5m FF_NETWORK_PER_BUILD: 1 PYTEST_NUMBER_OF_PROCESSES: 8 @@ -49,8 +51,11 @@ include: - project: hive/haf ref: b4225f9d2591195b0e6aadf36bbef921d95f92b9 # develop file: /scripts/ci-helpers/prepare_data_image_job.yml # implicitly pulls templates/base.gitlab-ci.yml from common-ci-configuration - # Do not include common-ci-configuration here, it is already referenced by scripts/ci-helpers/prepare_data_image_job.yml included from Haf/Hive repos -# Skip rules for docs-only changes and QUICK_TEST mode +# HAF app testing templates - provides change detection, sync helpers, test base templates +- project: hive/common-ci-configuration + ref: develop + file: /templates/haf_app_testing.gitlab-ci.yml +# Skip rules for docs-only changes and QUICK_TEST mode (to be replaced by common templates) - local: '/scripts/ci-helpers/skip_rules.yml' default: @@ -623,59 +628,6 @@ sync: - data-cache-storage - fast -# HAF service container with NFS fallback -# Uses HAF's built-in NFS cache extraction (in copy_datadir.sh) -# DATA_SOURCE format: /cache/{type}_{key} -> looks for /nfs/ci-cache/{type}/{key}.tar -.haf-instance-with-nfs-fallback: &haf-instance-with-nfs-fallback - name: ${HAF_IMAGE_NAME} - alias: haf-instance - variables: - PGCTLTIMEOUT: 600 - PG_ACCESS: | - "host all haf_admin 0.0.0.0/0 trust" - "host all hived 0.0.0.0/0 trust" - "host all hafbe_user 0.0.0.0/0 trust" - "host all hafbe_owner 0.0.0.0/0 trust" - "host all all 0.0.0.0/0 scram-sha-256" - # DATA_SOURCE format for HAF NFS fallback: /cache/{type}_{key} - # HAF parses this to find NFS tar at /nfs/ci-cache/{type}/{key}.tar - # Note: Must use HAF_COMMIT directly (not HAFBE_CACHE_KEY) for proper expansion in services - DATA_SOURCE: "/cache/hafbe_sync_${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}" - # NFS path where cache-manager.sh looks for tars - CACHE_NFS_PATH: "/nfs/ci-cache" - -.postgrest-service: &postgrest-service - name: registry.gitlab.syncad.com/hive/common-ci-configuration/postgrest:v12.0.2 - alias: postgrest-server - variables: - PGRST_ADMIN_SERVER_PORT: 3001 - PGRST_SERVER_PORT: 3000 - PGRST_DB_URI: postgresql://haf_admin@haf-instance:5432/haf_block_log - PGRST_DB_SCHEMA: hafbe_endpoints - PGRST_DB_ANON_ROLE: hafbe_user - PGRST_DB_POOL: 20 - PGRST_DB_POOL_ACQUISITION_TIMEOUT: 10 - PGRST_DB_EXTRA_SEARCH_PATH: hafbe_bal, reptracker_app - HEALTHCHECK_TCP_PORT: 3000 - -# Extension for jobs using HAF service with NFS fallback -# HAF entrypoint handles NFS extraction automatically via copy_datadir.sh 
-.wait-for-haf-postgres-with-nfs: - extends: .wait-for-haf-postgres - before_script: - - | - # Log cache path info for debugging - LOCAL_CACHE="/cache/hafbe_sync_${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}" - PGDATA_DIR="${LOCAL_CACHE}/datadir/haf_db_store/pgdata" - NFS_TAR="/nfs/ci-cache/hafbe_sync/${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}.tar" - echo "Main job: cache info" - echo " LOCAL_CACHE: ${LOCAL_CACHE}" - echo " NFS_TAR: ${NFS_TAR}" - echo " PGDATA would be at: ${PGDATA_DIR}" - echo " NFS tar exists: $(test -f "$NFS_TAR" && echo 'yes' || echo 'no')" - ls -la "${LOCAL_CACHE}/" 2>&1 | head -5 || echo " Cache dir not visible to main job yet" - - !reference [.wait-for-haf-postgres, before_script] - python_api_client_test: extends: .project_develop_configuration_template # Use Python 3.12 for consistency with generate_python_api_client @@ -697,182 +649,171 @@ python_api_client_test: tags: - public-runner-docker -regression-test: - extends: .wait-for-haf-postgres-with-nfs - # Use BUILDER_IMAGE with Python (ci-runner doesn't have python3 needed for data_insertion_script.py) - image: $BUILDER_IMAGE_PATH +# ============================================================================= +# TEST JOB BASE TEMPLATE (Docker Compose) +# ============================================================================= +# All test jobs use docker-compose-test.yml instead of service containers. +# This provides better control over container lifecycle and cache extraction. + +.hafbe_test_base: + extends: .docker_image_builder_job_template stage: test + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 needs: - job: quick_test_setup artifacts: true optional: true - job: sync artifacts: true - - job: docker-setup-docker-image-build - artifacts: true + - job: docker-ci-runner-build - job: prepare_haf_image artifacts: true - optional: true # Not needed in QUICK_TEST mode + optional: true rules: - if: $DOCS_ONLY == "true" when: never - when: on_success variables: - # Use recursive to properly handle nested submodules (haf/hive) + # Test data directories - extracted from sync cache + HAF_DATA_DIRECTORY: "${CI_PROJECT_DIR}/test-data" + HAF_SHM_DIRECTORY: "${CI_PROJECT_DIR}/test-shm" + COMPOSE_FILE: "docker/docker-compose-test.yml" + # Container hostnames in docker-compose network + HAF_HOST: "haf" + POSTGREST_HOST: "postgrest" + # Use recursive for nested submodules (haf/hive) GIT_SUBMODULE_STRATEGY: recursive HAF_APP_SCHEMA: "hafbe_app" - services: - - *haf-instance-with-nfs-fallback + before_script: + - !reference [.docker_image_builder_job_template, before_script] + # Fetch cache-manager + - !reference [.fetch_cache_manager, script] + # Create data directories + - mkdir -p "${HAF_DATA_DIRECTORY}" "${HAF_SHM_DIRECTORY}" + # Extract sync cache + - | + echo -e "\e[0Ksection_start:$(date +%s):extract_cache[collapsed=true]\r\e[0KExtracting sync cache..." + export SKIP_WAIT=true # Don't wait for postgres yet + ./scripts/ci-helpers/extract-cache-and-wait.sh "${HAFBE_SYNC_CACHE_TYPE}" "${HAFBE_CACHE_KEY}" "${HAF_DATA_DIRECTORY}" + echo -e "\e[0Ksection_end:$(date +%s):extract_cache\r\e[0K" + # Start docker-compose + - | + echo -e "\e[0Ksection_start:$(date +%s):compose_up[collapsed=true]\r\e[0KStarting test environment..." + docker compose -f "${COMPOSE_FILE}" up -d + echo -e "\e[0Ksection_end:$(date +%s):compose_up\r\e[0K" + # Wait for services to be ready + - | + echo -e "\e[0Ksection_start:$(date +%s):wait_services[collapsed=true]\r\e[0KWaiting for services..." 
+ ./scripts/ci-helpers/wait-for-postgrest.sh + echo -e "\e[0Ksection_end:$(date +%s):wait_services\r\e[0K" + after_script: + - | + echo -e "\e[0Ksection_start:$(date +%s):compose_down[collapsed=true]\r\e[0KStopping test environment..." + docker compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" logs > docker/container-logs.txt 2>&1 || true + docker compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" down -v || true + # Cleanup test data + sudo rm -rf "${HAF_DATA_DIRECTORY:-/tmp/test-data}" "${HAF_SHM_DIRECTORY:-/tmp/test-shm}" || true + echo -e "\e[0Ksection_end:$(date +%s):compose_down\r\e[0K" + artifacts: + paths: + - docker/container-logs.txt + when: always + expire_in: 1 week + tags: + - data-cache-storage + - fast + +# ============================================================================= +# TEST JOBS +# ============================================================================= + +regression-test: + extends: .hafbe_test_base + # Use BUILDER_IMAGE with Python (ci-runner doesn't have python3 needed for data_insertion_script.py) + image: $BUILDER_IMAGE_PATH script: - - | - echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning tests..." + - | + echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning regression tests..." - cd tests/account_parameters - ./accounts_dump_test.sh --host=haf-instance + cd tests/account_parameters + ./accounts_dump_test.sh --host=${HAF_HOST} - cd ../witness_parameters - ./witnesses_dump_test.sh --host=haf-instance + cd ../witness_parameters + ./witnesses_dump_test.sh --host=${HAF_HOST} - echo -e "\e[0Ksection_end:$(date +%s):tests\r\e[0K" + echo -e "\e[0Ksection_end:$(date +%s):tests\r\e[0K" artifacts: paths: - - tests/account_parameters/account_dump_test.log - - tests/witness_parameters/witness_dump_test.log + - docker/container-logs.txt + - tests/account_parameters/account_dump_test.log + - tests/witness_parameters/witness_dump_test.log when: always - tags: - - data-cache-storage - - fast setup-scripts-test: - extends: .wait-for-haf-postgres-with-nfs - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 - stage: test - needs: - - job: quick_test_setup - artifacts: true - optional: true - - job: sync - artifacts: true - - job: docker-setup-docker-image-build - artifacts: true - - job: prepare_haf_image - artifacts: true - optional: true - rules: - - if: $DOCS_ONLY == "true" - when: never - - when: on_success - variables: - # Use recursive to properly handle nested submodules (haf/hive) - GIT_SUBMODULE_STRATEGY: recursive - HAF_APP_SCHEMA: "hafbe_app" - services: - - *haf-instance-with-nfs-fallback + extends: .hafbe_test_base script: - - | - echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning tests..." + - | + echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning functional tests..." 
- cd tests/functional - ./test_scripts.sh --host=haf-instance + cd tests/functional + ./test_scripts.sh --host=${HAF_HOST} - echo -e "\e[0Ksection_end:$(date +%s):tests\r\e[0K" - tags: - - data-cache-storage - - fast + echo -e "\e[0Ksection_end:$(date +%s):tests\r\e[0K" performance-test: - extends: .wait-for-haf-postgres-with-nfs + extends: .hafbe_test_base # Use BUILDER_IMAGE with Python (ci-runner doesn't have python3 needed for generate_db.py) image: $BUILDER_IMAGE_PATH - stage: test - needs: - - job: quick_test_setup - artifacts: true - optional: true - - job: sync - artifacts: true - - job: docker-setup-docker-image-build - artifacts: true - - job: prepare_haf_image - artifacts: true - optional: true - rules: - - if: $DOCS_ONLY == "true" - when: never - - when: on_success - variables: - # Use recursive to properly handle nested submodules (haf/hive) - GIT_SUBMODULE_STRATEGY: recursive - HAF_APP_SCHEMA: "hafbe_app" - services: - - *haf-instance-with-nfs-fallback - - *postgrest-service script: - - | - echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning tests..." + - | + echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning performance tests..." - timeout -k 1m 15m ./tests/run_performance_tests.sh --postgresql-host=haf-instance --postgrest-host=postgrest-server --database-size=6000 --test-loop-count=1000 - tar -cf - $(pwd)/tests/performance/result* | 7z a -si -mx9 tests/performance/results.tar.7z - cat jmeter.log | python3 docker/ci/parse-jmeter-output.py - m2u --input $(pwd)/tests/performance/result/result.xml --output $(pwd)/tests/performance/junit-result.xml + timeout -k 1m 15m ./tests/run_performance_tests.sh --postgresql-host=${HAF_HOST} --postgrest-host=${POSTGREST_HOST} --database-size=6000 --test-loop-count=1000 + tar -cf - $(pwd)/tests/performance/result* | 7z a -si -mx9 tests/performance/results.tar.7z + cat jmeter.log | python3 docker/ci/parse-jmeter-output.py + m2u --input $(pwd)/tests/performance/result/result.xml --output $(pwd)/tests/performance/junit-result.xml - echo -e "\e[0Ksection_end:$(date +%s):tests\r\e[0K" + echo -e "\e[0Ksection_end:$(date +%s):tests\r\e[0K" artifacts: paths: - - tests/performance/result/result_report/ - - tests/performance/results.tar.7z - - jmeter.log + - docker/container-logs.txt + - tests/performance/result/result_report/ + - tests/performance/results.tar.7z + - jmeter.log when: always reports: junit: tests/performance/junit-result.xml - tags: - - data-cache-storage - - fast pattern-test: - extends: .pytest_based_template - stage: test - needs: - - job: quick_test_setup - artifacts: true - optional: true - - job: sync - artifacts: true - - job: docker-setup-docker-image-build - artifacts: true - - job: prepare_haf_image - artifacts: true - optional: true - rules: - - if: $DOCS_ONLY == "true" - when: never - - when: on_success - services: - - *haf-instance-with-nfs-fallback - - *postgrest-service + extends: + - .hafbe_test_base + - .pytest_based_template + image: $BUILDER_IMAGE_PATH variables: - # Need recursive for nested haf/hive submodule (hive-local-tools) - GIT_SUBMODULE_STRATEGY: recursive JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml PYTEST_BASED_IMAGE_NAME: $BUILDER_IMAGE_PATH POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools - HAFBE_ADDRESS: postgrest-server + HAFBE_ADDRESS: ${POSTGREST_HOST} HAFBE_PORT: 3000 TAVERN_DIR: $CI_PROJECT_DIR/tests/tavern - HAF_APP_SCHEMA: "hafbe_app" before_script: - - !reference [.pytest_based_template, before_script] - - !reference 
[.wait-for-haf-postgres-with-nfs, before_script] + - !reference [.hafbe_test_base, before_script] + - !reference [.pytest_based_template, before_script] script: - - | - cd $CI_PROJECT_DIR/tests/tavern - pytest -n $PYTEST_NUMBER_OF_PROCESSES --junitxml report.xml . + - | + echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning Tavern API tests..." + + cd $CI_PROJECT_DIR/tests/tavern + pytest -n $PYTEST_NUMBER_OF_PROCESSES --junitxml report.xml . + + echo -e "\e[0Ksection_end:$(date +%s):tests\r\e[0K" artifacts: paths: - - "**/*.out.json" - tags: - - data-cache-storage - - fast + - docker/container-logs.txt + - "**/*.out.json" + when: always + reports: + junit: $CI_PROJECT_DIR/tests/tavern/report.xml build_and_publish_image: stage: publish diff --git a/docker/docker-compose-test.yml b/docker/docker-compose-test.yml new file mode 100644 index 00000000..ae66c810 --- /dev/null +++ b/docker/docker-compose-test.yml @@ -0,0 +1,97 @@ +# Lightweight Docker Compose for running tests against pre-synced data. +# This compose file is used by CI test jobs that have already extracted +# the synced HAF + haf_block_explorer data from cache. +# +# Unlike the main docker-compose.yml, this does NOT: +# - Run HAF replay (data is pre-synced) +# - Run app-setup, backend-setup, or block-processing services +# - Include dev tools (swagger, pghero, pgadmin) +# +# Usage: +# export HAF_DATA_DIRECTORY=/path/to/extracted/cache +# export HAF_SHM_DIRECTORY=/path/to/shm +# docker compose -f docker-compose-test.yml up -d +# +# The HAF_DATA_DIRECTORY should contain: +# - datadir/pgdata (PostgreSQL database with synced data) +# - datadir/blockchain (block_log files) + +name: 'haf-be-test' + +services: + haf: + image: ${HAF_IMAGE_NAME:-registry.gitlab.syncad.com/hive/haf/instance:latest} + # No replay - just start PostgreSQL with existing data + entrypoint: /home/haf_admin/docker_entrypoint.sh + command: "" + environment: + HIVED_UID: ${HIVED_UID:-0} + DATADIR: /home/hived/datadir + SHM_DIR: /home/hived/shm_dir + PGCTLTIMEOUT: 600 + # Trust all connections in test environment + PG_ACCESS: | + host all haf_admin all trust + host all haf_app_admin all trust + host all hafbe_owner all trust + host all hafbe_user all trust + host all btracker_owner all trust + host all reptracker_owner all trust + host all pghero all trust + volumes: + - haf_datadir:/home/hived/datadir + - haf_shmdir:/home/hived/shm_dir + networks: + - haf-network-test + healthcheck: + test: ["CMD-SHELL", "psql -U haf_admin -d haf_block_log -c 'SELECT 1' || exit 1"] + interval: 10s + timeout: 5s + retries: 30 + start_period: 60s + ports: + - "5432:5432" + + postgrest: + image: ${POSTGREST_REGISTRY:-postgrest/postgrest}:${POSTGREST_VERSION:-latest} + depends_on: + haf: + condition: service_healthy + environment: + PGRST_ADMIN_SERVER_PORT: 3001 + PGRST_SERVER_PORT: 3000 + PGRST_DB_URI: postgresql://hafbe_owner@haf:5432/haf_block_log + PGRST_DB_SCHEMA: hafbe_endpoints + PGRST_DB_ANON_ROLE: hafbe_user + PGRST_DB_POOL: 20 + PGRST_DB_POOL_ACQUISITION_TIMEOUT: 10 + PGRST_DB_EXTRA_SEARCH_PATH: hafbe_bal, reptracker_app + networks: + - haf-network-test + ports: + - "3000:3000" + - "3001:3001" + healthcheck: + test: ["CMD-SHELL", "curl -sf http://localhost:3001/ready || exit 1"] + interval: 5s + timeout: 3s + retries: 30 + start_period: 30s + +networks: + haf-network-test: + name: haf-network-test-${CI_JOB_ID:-local} + +volumes: + haf_datadir: + driver: local + driver_opts: + o: bind + type: none + device: ${HAF_DATA_DIRECTORY:-/tmp/haf_data} + haf_shmdir: + driver: 
local + driver_opts: + o: bind + type: none + device: ${HAF_SHM_DIRECTORY:-/tmp/haf_shm} diff --git a/scripts/ci-helpers/extract-cache-and-wait.sh b/scripts/ci-helpers/extract-cache-and-wait.sh new file mode 100755 index 00000000..465b1d68 --- /dev/null +++ b/scripts/ci-helpers/extract-cache-and-wait.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# Extract HAF Block Explorer sync cache and prepare for testing. +# +# This script handles cache extraction from NFS with proper race condition +# handling using marker files. It's designed to be called before starting +# docker-compose in test jobs. +# +# Usage: +# extract-cache-and-wait.sh <cache-type> <cache-key> <dest-dir> +# +# Arguments: +# cache-type - Cache type (e.g., haf_hafbe_sync) +# cache-key - Cache key (e.g., ${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}) +# dest-dir - Destination directory for extracted cache +# +# Environment variables: +# CACHE_MANAGER - Path to cache-manager.sh (required) +# CI_PIPELINE_ID - GitLab pipeline ID (for marker file) +# EXTRACT_TIMEOUT - Timeout in seconds (default: 300) +# SKIP_WAIT - Set to "true" to skip PostgreSQL wait +# POSTGRES_HOST - PostgreSQL host for readiness check (default: localhost) +# POSTGRES_PORT - PostgreSQL port (default: 5432) +# +# Marker file pattern: +# ${dest-dir}/.ready - Contains pipeline ID that performed extraction +# +# Exit codes: +# 0 - Success (cache extracted or already available) +# 1 - Error (cache not found, extraction failed, or timeout) + +set -euo pipefail + +# Arguments +CACHE_TYPE="${1:?Usage: extract-cache-and-wait.sh <cache-type> <cache-key> <dest-dir>}" +CACHE_KEY="${2:?Usage: extract-cache-and-wait.sh <cache-type> <cache-key> <dest-dir>}" +DEST_DIR="${3:?Usage: extract-cache-and-wait.sh <cache-type> <cache-key> <dest-dir>}" + +# Configuration +MARKER_FILE="${DEST_DIR}/.ready" +TIMEOUT="${EXTRACT_TIMEOUT:-300}" +POSTGRES_HOST="${POSTGRES_HOST:-localhost}" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +SKIP_WAIT="${SKIP_WAIT:-false}" +echo "=== HAF Block Explorer Cache Extraction ===" +echo "Cache type: ${CACHE_TYPE}" +echo "Cache key: ${CACHE_KEY}" +echo "Dest dir: ${DEST_DIR}" +echo "Pipeline: ${CI_PIPELINE_ID:-local}" +echo "" + +# Verify cache-manager is available +if [[ -z "${CACHE_MANAGER:-}" ]]; then + echo "ERROR: CACHE_MANAGER environment variable not set" + echo "Ensure .fetch_cache_manager has been run first" + exit 1 +fi + +if [[ ! -x "$CACHE_MANAGER" ]]; then + echo "ERROR: Cache manager not found or not executable: $CACHE_MANAGER" + exit 1 +fi + +# Check if extraction already done for this pipeline +if [[ -f "$MARKER_FILE" ]]; then + MARKER_PIPELINE=$(cat "$MARKER_FILE" 2>/dev/null || echo "") + if [[ "$MARKER_PIPELINE" == "${CI_PIPELINE_ID:-local}" ]]; then + echo "Cache already extracted for this pipeline (marker: ${MARKER_PIPELINE})" + echo "Skipping extraction" + exit 0 + fi + echo "Marker file exists but for different pipeline: ${MARKER_PIPELINE}" +fi + +# Check if data directory already has valid data +PGDATA="${DEST_DIR}/datadir/pgdata" +if [[ -d "$PGDATA" ]]; then + echo "Data directory exists: $PGDATA" + # Check if PostgreSQL data looks valid + if [[ -f "$PGDATA/PG_VERSION" ]]; then + echo "PostgreSQL data appears valid (PG_VERSION exists)" + # Update marker file + echo "${CI_PIPELINE_ID:-local}" > "$MARKER_FILE" + echo "Updated marker file, skipping extraction" + exit 0 + fi + echo "PostgreSQL data incomplete, will re-extract" +fi + +# Create destination directory +mkdir -p "${DEST_DIR}" + +# Extract cache using cache-manager +echo "" +echo "=== Extracting Cache ===" +echo "Running: CACHE_HANDLING=haf \$CACHE_MANAGER get ${CACHE_TYPE} ${CACHE_KEY} ${DEST_DIR}" + +if ! 
CACHE_HANDLING=haf "$CACHE_MANAGER" get "${CACHE_TYPE}" "${CACHE_KEY}" "${DEST_DIR}"; then + echo "" + echo "ERROR: Cache extraction failed" + echo "" + echo "Possible causes:" + echo " - Cache does not exist for key: ${CACHE_KEY}" + echo " - NFS not mounted or not accessible" + echo " - Sync job did not complete successfully" + echo "" + echo "Debug commands:" + echo " ls -la ${DATA_CACHE_NFS_PREFIX:-/nfs/ci-cache}/${CACHE_TYPE}/ | head -10" + echo " ls -la /nfs/ci-cache/${CACHE_TYPE}/${CACHE_KEY}.tar" + exit 1 +fi + +echo "Cache extracted successfully" + +# Fix PostgreSQL permissions (must be 700 for pg_ctl to work) +if [[ -d "$PGDATA" ]]; then + echo "" + echo "=== Fixing PostgreSQL Permissions ===" + chmod 700 "$PGDATA" + echo "Set $PGDATA permissions to 700" +fi + +# Write marker file +echo "${CI_PIPELINE_ID:-local}" > "$MARKER_FILE" +echo "Wrote marker file: $MARKER_FILE" + +# Optionally wait for PostgreSQL +if [[ "$SKIP_WAIT" == "true" ]]; then + echo "" + echo "Skipping PostgreSQL wait (SKIP_WAIT=true)" + exit 0 +fi + +echo "" +echo "=== Waiting for PostgreSQL ===" +echo "Host: ${POSTGRES_HOST}:${POSTGRES_PORT}" +echo "Timeout: ${TIMEOUT}s" + +WAITED=0 +while ! pg_isready -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -q 2>/dev/null; do + sleep 5 + WAITED=$((WAITED + 5)) + if [[ $WAITED -ge $TIMEOUT ]]; then + echo "" + echo "ERROR: PostgreSQL not ready after ${TIMEOUT}s" + echo "This may be normal if the container hasn't started yet." + echo "If running in CI, ensure docker-compose is started after this script." + # Exit 0 here since we extracted successfully - postgres will start later + exit 0 + fi + echo "Waiting for PostgreSQL... (${WAITED}s)" +done + +echo "PostgreSQL ready after ${WAITED}s" +exit 0 diff --git a/scripts/ci-helpers/skip_rules.yml b/scripts/ci-helpers/skip_rules.yml index e488bc49..5915fada 100644 --- a/scripts/ci-helpers/skip_rules.yml +++ b/scripts/ci-helpers/skip_rules.yml @@ -18,91 +18,40 @@ variables: # ============================================================================ # Change Detection Job # ============================================================================ +# Uses the common-ci-configuration .haf_app_detect_changes template with +# customizations for backward compatibility. -.detect_changes: - stage: lint - image: alpine:latest - needs: [] +detect_changes: + extends: .haf_app_detect_changes variables: - # Job only checks git diff, doesn't need submodules - GIT_SUBMODULE_STRATEGY: none - before_script: - - apk add --no-cache git + # Skip patterns: files matching these don't require HAF sync + # (tests, docs, markdown, readme, changelog, license, claude instructions) + HAF_APP_SKIP_PATTERNS: '^tests/|^docs/|\.md$|^README|^CHANGELOG|^LICENSE|^CLAUDE' script: + # Run the parent template's detection first + - !reference [.haf_app_detect_changes, script] + # Add backward-compatible DOCS_ONLY variable for existing rule templates + # AUTO_SKIP_SYNC from common template = DOCS_ONLY in our rule templates - | - echo "Detecting what files changed..." 
- - # Determine base commit to compare against - if [ -n "$CI_MERGE_REQUEST_DIFF_BASE_SHA" ]; then - BASE_SHA="$CI_MERGE_REQUEST_DIFF_BASE_SHA" - echo "Using MR diff base: $BASE_SHA" - elif [ -n "$CI_COMMIT_BEFORE_SHA" ] && [ "$CI_COMMIT_BEFORE_SHA" != "0000000000000000000000000000000000000000" ]; then - BASE_SHA="$CI_COMMIT_BEFORE_SHA" - echo "Using commit before SHA: $BASE_SHA" - else - BASE_SHA="HEAD~1" - echo "Using HEAD~1 as base" - fi - - # Get list of changed files - echo "Changed files:" - git diff --name-only "$BASE_SHA" HEAD 2>/dev/null || git diff --name-only HEAD~1 HEAD | head -50 - - # Check if source code changed (files that require builds/sync) - SOURCE_CHANGED="false" - if git diff --name-only "$BASE_SHA" HEAD 2>/dev/null | grep -qE '^(backend/|endpoints/|docker/|scripts/|submodules/|Dockerfile|\.gitlab-ci\.yml)'; then - SOURCE_CHANGED="true" - fi - - # Check if tests changed - TESTS_CHANGED="false" - if git diff --name-only "$BASE_SHA" HEAD 2>/dev/null | grep -qE '^tests/'; then - TESTS_CHANGED="true" + if [ -f detect_changes.env ]; then + AUTO_SKIP=$(grep "AUTO_SKIP_SYNC=" detect_changes.env | cut -d= -f2) + echo "DOCS_ONLY=${AUTO_SKIP}" >> detect_changes.env + echo "" + echo "=== Backward Compatibility ===" + echo "DOCS_ONLY=${AUTO_SKIP} (alias for AUTO_SKIP_SYNC)" fi - - # Determine if this is a docs-only change - # Docs-only means: only docs/readme/misc files changed, no source code or tests - DOCS_ONLY="false" - if [ "$SOURCE_CHANGED" = "false" ] && [ "$TESTS_CHANGED" = "false" ]; then - echo "No source or test changes detected" - # Verify something actually changed (not empty commit) - if git diff --name-only "$BASE_SHA" HEAD 2>/dev/null | grep -q .; then - DOCS_ONLY="true" - echo ">>> DOCS_ONLY=true - skipping builds and tests" - fi - fi - - echo "" - echo "Detection results:" - echo " SOURCE_CHANGED=$SOURCE_CHANGED" - echo " TESTS_CHANGED=$TESTS_CHANGED" - echo " DOCS_ONLY=$DOCS_ONLY" - - # Write to dotenv file for other jobs - echo "DOCS_ONLY=$DOCS_ONLY" > detect_changes.env - echo "SOURCE_CHANGED=$SOURCE_CHANGED" >> detect_changes.env - artifacts: - reports: - dotenv: detect_changes.env rules: # Skip detection on protected branches - always run full pipeline - if: $CI_COMMIT_BRANCH == "develop" || $CI_COMMIT_BRANCH == "master" when: never - # Skip on tags + # Skip on tags (common template handles this too, but explicit for clarity) - if: $CI_COMMIT_TAG when: never # Skip if forcing full pipeline - if: $FORCE_FULL_PIPELINE == "true" when: never - # Skip if QUICK_TEST is manually enabled - - if: $QUICK_TEST == "true" - when: never + # Run for all other cases - when: on_success - tags: - - public-runner-docker - -detect_changes: - extends: .detect_changes # ============================================================================ # Rule Templates for Jobs diff --git a/scripts/ci-helpers/wait-for-postgrest.sh b/scripts/ci-helpers/wait-for-postgrest.sh new file mode 100755 index 00000000..fac2728e --- /dev/null +++ b/scripts/ci-helpers/wait-for-postgrest.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Wait for HAF and PostgREST to be ready in docker-compose test environment. +# +# This script waits for both services to be healthy before tests can run: +# 1. HAF (PostgreSQL) - database must be accepting connections +# 2. 
PostgREST - REST API must be responding +# +# Usage: +# wait-for-postgrest.sh [options] +# +# Environment variables: +# HAF_HOST - HAF/PostgreSQL hostname (default: haf) +# HAF_PORT - PostgreSQL port (default: 5432) +# POSTGREST_HOST - PostgREST hostname (default: postgrest) +# POSTGREST_PORT - PostgREST port (default: 3000) +# POSTGREST_ADMIN_PORT - PostgREST admin port (default: 3001) +# WAIT_TIMEOUT - Total timeout in seconds (default: 300) + +set -euo pipefail + +HAF_HOST="${HAF_HOST:-haf}" +HAF_PORT="${HAF_PORT:-5432}" +POSTGREST_HOST="${POSTGREST_HOST:-postgrest}" +POSTGREST_PORT="${POSTGREST_PORT:-3000}" +POSTGREST_ADMIN_PORT="${POSTGREST_ADMIN_PORT:-3001}" +TIMEOUT="${WAIT_TIMEOUT:-300}" + +echo "=== Waiting for Test Services ===" +echo "HAF: ${HAF_HOST}:${HAF_PORT}" +echo "PostgREST: ${POSTGREST_HOST}:${POSTGREST_PORT} (admin: ${POSTGREST_ADMIN_PORT})" +echo "Timeout: ${TIMEOUT}s" +echo "" + +WAITED=0 + +# Wait for HAF/PostgreSQL +echo "--- Waiting for HAF (PostgreSQL) ---" +while ! pg_isready -h "${HAF_HOST}" -p "${HAF_PORT}" -q 2>/dev/null; do + sleep 5 + WAITED=$((WAITED + 5)) + if [[ $WAITED -ge $TIMEOUT ]]; then + echo "ERROR: HAF not ready after ${TIMEOUT}s" + exit 1 + fi + echo "Waiting for HAF... (${WAITED}s)" +done +echo "HAF ready after ${WAITED}s" +echo "" + +# Wait for PostgREST +echo "--- Waiting for PostgREST ---" +while ! curl -sf "http://${POSTGREST_HOST}:${POSTGREST_ADMIN_PORT}/ready" >/dev/null 2>&1; do + sleep 5 + WAITED=$((WAITED + 5)) + if [[ $WAITED -ge $TIMEOUT ]]; then + echo "ERROR: PostgREST not ready after ${TIMEOUT}s" + echo "" + echo "Debug info:" + echo " curl -v http://${POSTGREST_HOST}:${POSTGREST_ADMIN_PORT}/ready" + echo " docker compose logs postgrest" + exit 1 + fi + echo "Waiting for PostgREST... (${WAITED}s)" +done +echo "PostgREST ready after ${WAITED}s" +echo "" + +# Verify API is responding +echo "--- Verifying API ---" +if curl -sf "http://${POSTGREST_HOST}:${POSTGREST_PORT}/" >/dev/null 2>&1; then + echo "API responding at http://${POSTGREST_HOST}:${POSTGREST_PORT}/" +else + echo "WARNING: API root not responding, but admin endpoint is ready" +fi + +echo "" +echo "=== All Services Ready ===" +exit 0 -- GitLab From 95dcb7e4e2a5ab02fd3ca1dacf3ccc350b614ee5 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 17:01:18 -0500 Subject: [PATCH 076/108] Fix !reference: .fetch_cache_manager uses before_script not script The .fetch_cache_manager template in common-ci-configuration defines a before_script block, not script. This caused the test jobs to fail with '!reference could not be found' when creating the pipeline. 
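To illustrate the mismatch (the template shape here is hypothetical; the real definition lives in common-ci-configuration):

    # upstream template publishes its steps under before_script:
    .fetch_cache_manager:
      before_script:
        - <download cache-manager.sh>   # placeholder for the upstream steps

    # so consumers must reference that exact key:
    before_script:
      - !reference [.fetch_cache_manager, before_script]   # resolves
      # - !reference [.fetch_cache_manager, script]        # fails at pipeline creation: key does not exist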
--- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 46593537..7b5d5a1d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -687,7 +687,7 @@ python_api_client_test: before_script: - !reference [.docker_image_builder_job_template, before_script] # Fetch cache-manager - - !reference [.fetch_cache_manager, script] + - !reference [.fetch_cache_manager, before_script] # Create data directories - mkdir -p "${HAF_DATA_DIRECTORY}" "${HAF_SHM_DIRECTORY}" # Extract sync cache -- GitLab From 4d00889d52b291034df7b30235e1909aa673f04d Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 17:34:15 -0500 Subject: [PATCH 077/108] Fix test jobs: volume path, docker setup, and Python installation - Fix docker-compose-test.yml volume path: add /datadir to bind mount (cache structure: HAF_DATA_DIRECTORY/datadir/haf_db_store/pgdata) - Remove docker buildx setup from test jobs (not needed for docker-compose) Just use git safe.directory setup instead - Add DOCKER_HOST and DOCKER_TLS_CERTDIR for proper dind connection - Install Python3 in before_script for Alpine-based ci-runner image - Remove image overrides from test jobs (use ci-runner with docker-compose) --- .gitlab-ci.yml | 20 +++++++++++++------- docker/docker-compose-test.yml | 3 ++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7b5d5a1d..75a66539 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -674,6 +674,9 @@ python_api_client_test: when: never - when: on_success variables: + # Docker-in-docker connection (disable TLS for simplicity) + DOCKER_TLS_CERTDIR: "" + DOCKER_HOST: "tcp://docker:2375" # Test data directories - extracted from sync cache HAF_DATA_DIRECTORY: "${CI_PROJECT_DIR}/test-data" HAF_SHM_DIRECTORY: "${CI_PROJECT_DIR}/test-shm" @@ -685,7 +688,14 @@ python_api_client_test: GIT_SUBMODULE_STRATEGY: recursive HAF_APP_SCHEMA: "hafbe_app" before_script: - - !reference [.docker_image_builder_job_template, before_script] + # Git setup (from docker_image_builder_job_template, without buildx) + - git config --global --add safe.directory '*' + # Install Python if not available (Alpine-based ci-runner image) + - | + if ! command -v python3 &> /dev/null; then + echo "Installing Python3..." + apk add --no-cache python3 py3-pip + fi # Fetch cache-manager - !reference [.fetch_cache_manager, before_script] # Create data directories @@ -729,8 +739,6 @@ python_api_client_test: regression-test: extends: .hafbe_test_base - # Use BUILDER_IMAGE with Python (ci-runner doesn't have python3 needed for data_insertion_script.py) - image: $BUILDER_IMAGE_PATH script: - | echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning regression tests..." @@ -762,8 +770,6 @@ setup-scripts-test: performance-test: extends: .hafbe_test_base - # Use BUILDER_IMAGE with Python (ci-runner doesn't have python3 needed for generate_db.py) - image: $BUILDER_IMAGE_PATH script: - | echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning performance tests..." 
@@ -788,16 +794,16 @@ pattern-test: extends: - .hafbe_test_base - .pytest_based_template - image: $BUILDER_IMAGE_PATH variables: JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml - PYTEST_BASED_IMAGE_NAME: $BUILDER_IMAGE_PATH POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools HAFBE_ADDRESS: ${POSTGREST_HOST} HAFBE_PORT: 3000 TAVERN_DIR: $CI_PROJECT_DIR/tests/tavern before_script: - !reference [.hafbe_test_base, before_script] + # Install poetry for pytest template (Alpine packages) + - pip3 install --break-system-packages poetry - !reference [.pytest_based_template, before_script] script: - | diff --git a/docker/docker-compose-test.yml b/docker/docker-compose-test.yml index ae66c810..d22543fb 100644 --- a/docker/docker-compose-test.yml +++ b/docker/docker-compose-test.yml @@ -88,7 +88,8 @@ volumes: driver_opts: o: bind type: none - device: ${HAF_DATA_DIRECTORY:-/tmp/haf_data} + # Cache structure: HAF_DATA_DIRECTORY/datadir/haf_db_store/pgdata + device: ${HAF_DATA_DIRECTORY:-/tmp/haf_data}/datadir haf_shmdir: driver: local driver_opts: -- GitLab From bce85772701f8defde2c5f9e0ddbaaeeeba8d72d Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 17:47:16 -0500 Subject: [PATCH 078/108] Install docker CLI in test jobs instead of Python in ci-runner The ci-runner image runs as non-root so apk add fails. Instead: - Use BUILDER_IMAGE_PATH (has Python) for test jobs that need Python - Install Docker CLI and docker-compose in before_script for those jobs - Download static Docker binary and docker-compose from GitHub releases --- .gitlab-ci.yml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 75a66539..a6a61522 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -690,11 +690,18 @@ python_api_client_test: before_script: # Git setup (from docker_image_builder_job_template, without buildx) - git config --global --add safe.directory '*' - # Install Python if not available (Alpine-based ci-runner image) + # Install docker-compose if not available (for non-docker images like BUILDER_IMAGE) - | - if ! command -v python3 &> /dev/null; then - echo "Installing Python3..." - apk add --no-cache python3 py3-pip + if ! command -v docker &> /dev/null; then + echo "Installing Docker CLI and Compose..." + DOCKER_VERSION="27.3.1" + curl -fsSL "https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz" | tar xz -C /tmp + mv /tmp/docker/docker /usr/local/bin/ + + COMPOSE_VERSION="v2.30.3" + curl -fsSL "https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-linux-x86_64" -o /usr/local/bin/docker-compose + chmod +x /usr/local/bin/docker-compose + ln -sf /usr/local/bin/docker-compose /usr/local/bin/docker-compose fi # Fetch cache-manager - !reference [.fetch_cache_manager, before_script] @@ -739,6 +746,7 @@ python_api_client_test: regression-test: extends: .hafbe_test_base + image: $BUILDER_IMAGE_PATH script: - | echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning regression tests..." @@ -770,6 +778,7 @@ setup-scripts-test: performance-test: extends: .hafbe_test_base + image: $BUILDER_IMAGE_PATH script: - | echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning performance tests..." 
@@ -794,6 +803,7 @@ pattern-test: extends: - .hafbe_test_base - .pytest_based_template + image: $BUILDER_IMAGE_PATH variables: JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools -- GitLab From db93af5519cb0a856fec7f874cda0667cb932e80 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 18:50:48 -0500 Subject: [PATCH 079/108] Fix docker-compose: use standalone command with wrapper for plugin compatibility - Split docker CLI and compose installation into separate checks - Create wrapper script when docker compose plugin available but docker-compose not - Change test job invocations from 'docker compose' to 'docker-compose' - Handles both ci-runner (has plugin) and BUILDER_IMAGE (needs standalone) --- .gitlab-ci.yml | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a6a61522..7a94cb85 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -690,18 +690,26 @@ python_api_client_test: before_script: # Git setup (from docker_image_builder_job_template, without buildx) - git config --global --add safe.directory '*' - # Install docker-compose if not available (for non-docker images like BUILDER_IMAGE) + # Install docker and docker-compose if not available - | if ! command -v docker &> /dev/null; then - echo "Installing Docker CLI and Compose..." + echo "Installing Docker CLI..." DOCKER_VERSION="27.3.1" curl -fsSL "https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz" | tar xz -C /tmp mv /tmp/docker/docker /usr/local/bin/ - - COMPOSE_VERSION="v2.30.3" - curl -fsSL "https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-linux-x86_64" -o /usr/local/bin/docker-compose - chmod +x /usr/local/bin/docker-compose - ln -sf /usr/local/bin/docker-compose /usr/local/bin/docker-compose + fi + # Ensure docker-compose command is available (standalone or wrapper for plugin) + if ! command -v docker-compose &> /dev/null; then + if docker compose version &> /dev/null; then + echo "Creating docker-compose wrapper for docker compose plugin..." + printf '#!/bin/sh\nexec docker compose "$@"\n' > /usr/local/bin/docker-compose + chmod +x /usr/local/bin/docker-compose + else + echo "Installing docker-compose standalone..." + COMPOSE_VERSION="v2.30.3" + curl -fsSL "https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-linux-x86_64" -o /usr/local/bin/docker-compose + chmod +x /usr/local/bin/docker-compose + fi fi # Fetch cache-manager - !reference [.fetch_cache_manager, before_script] @@ -716,7 +724,7 @@ python_api_client_test: # Start docker-compose - | echo -e "\e[0Ksection_start:$(date +%s):compose_up[collapsed=true]\r\e[0KStarting test environment..." - docker compose -f "${COMPOSE_FILE}" up -d + docker-compose -f "${COMPOSE_FILE}" up -d echo -e "\e[0Ksection_end:$(date +%s):compose_up\r\e[0K" # Wait for services to be ready - | @@ -726,8 +734,8 @@ python_api_client_test: after_script: - | echo -e "\e[0Ksection_start:$(date +%s):compose_down[collapsed=true]\r\e[0KStopping test environment..." 
- docker compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" logs > docker/container-logs.txt 2>&1 || true - docker compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" down -v || true + docker-compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" logs > docker/container-logs.txt 2>&1 || true + docker-compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" down -v || true # Cleanup test data sudo rm -rf "${HAF_DATA_DIRECTORY:-/tmp/test-data}" "${HAF_SHM_DIRECTORY:-/tmp/test-shm}" || true echo -e "\e[0Ksection_end:$(date +%s):compose_down\r\e[0K" -- GitLab From 1a71f2a525468d4850ae87fab2a7634ded14b3d8 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 19:10:09 -0500 Subject: [PATCH 080/108] Fix docker-compose PATH for non-root images - Use CI_PROJECT_DIR/.local-bin for docker/compose binaries (writable by non-root) - Add PATH export to each script block that uses docker-compose - LOCAL_BIN variable defined in job variables for consistency --- .gitlab-ci.yml | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7a94cb85..63e81017 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -687,30 +687,35 @@ python_api_client_test: # Use recursive for nested submodules (haf/hive) GIT_SUBMODULE_STRATEGY: recursive HAF_APP_SCHEMA: "hafbe_app" + # Local bin for non-root docker/compose installation + LOCAL_BIN: "${CI_PROJECT_DIR}/.local-bin" before_script: # Git setup (from docker_image_builder_job_template, without buildx) - git config --global --add safe.directory '*' - # Install docker and docker-compose if not available + # Install docker and docker-compose if not available (use $LOCAL_BIN for non-root images) - | + mkdir -p "$LOCAL_BIN" + export PATH="$LOCAL_BIN:$PATH" if ! command -v docker &> /dev/null; then - echo "Installing Docker CLI..." + echo "Installing Docker CLI to $LOCAL_BIN..." DOCKER_VERSION="27.3.1" curl -fsSL "https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz" | tar xz -C /tmp - mv /tmp/docker/docker /usr/local/bin/ + mv /tmp/docker/docker "$LOCAL_BIN/" fi # Ensure docker-compose command is available (standalone or wrapper for plugin) if ! command -v docker-compose &> /dev/null; then if docker compose version &> /dev/null; then echo "Creating docker-compose wrapper for docker compose plugin..." - printf '#!/bin/sh\nexec docker compose "$@"\n' > /usr/local/bin/docker-compose - chmod +x /usr/local/bin/docker-compose + printf '#!/bin/sh\nexec docker compose "$@"\n' > "$LOCAL_BIN/docker-compose" + chmod +x "$LOCAL_BIN/docker-compose" else - echo "Installing docker-compose standalone..." + echo "Installing docker-compose standalone to $LOCAL_BIN..." 
COMPOSE_VERSION="v2.30.3" - curl -fsSL "https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-linux-x86_64" -o /usr/local/bin/docker-compose - chmod +x /usr/local/bin/docker-compose + curl -fsSL "https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-linux-x86_64" -o "$LOCAL_BIN/docker-compose" + chmod +x "$LOCAL_BIN/docker-compose" fi fi + echo "docker-compose path: $(command -v docker-compose)" # Fetch cache-manager - !reference [.fetch_cache_manager, before_script] # Create data directories @@ -723,6 +728,7 @@ python_api_client_test: echo -e "\e[0Ksection_end:$(date +%s):extract_cache\r\e[0K" # Start docker-compose - | + export PATH="$LOCAL_BIN:$PATH" echo -e "\e[0Ksection_start:$(date +%s):compose_up[collapsed=true]\r\e[0KStarting test environment..." docker-compose -f "${COMPOSE_FILE}" up -d echo -e "\e[0Ksection_end:$(date +%s):compose_up\r\e[0K" @@ -733,6 +739,7 @@ python_api_client_test: echo -e "\e[0Ksection_end:$(date +%s):wait_services\r\e[0K" after_script: - | + export PATH="$LOCAL_BIN:$PATH" echo -e "\e[0Ksection_start:$(date +%s):compose_down[collapsed=true]\r\e[0KStopping test environment..." docker-compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" logs > docker/container-logs.txt 2>&1 || true docker-compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" down -v || true -- GitLab From 36255260397d9ead2f9953a0ee340b0e6ac979a4 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 20:33:42 -0500 Subject: [PATCH 081/108] Fix HAF container permissions: run as root (HIVED_UID=0) The extracted cache is owned by the CI job user, but HAF container defaults to UID 4000. Setting HIVED_UID=0 runs the container as root to access the CI-extracted data. --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 63e81017..d08bf724 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -689,6 +689,8 @@ python_api_client_test: HAF_APP_SCHEMA: "hafbe_app" # Local bin for non-root docker/compose installation LOCAL_BIN: "${CI_PROJECT_DIR}/.local-bin" + # Run HAF container as root to access CI-extracted data + HIVED_UID: "0" before_script: # Git setup (from docker_image_builder_job_template, without buildx) - git config --global --add safe.directory '*' -- GitLab From 68cb56644735858ea1bd78108d1a30416b8f955c Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 21:14:33 -0500 Subject: [PATCH 082/108] Fix extracted data permissions for HAF container - Add explicit HIVED_UID export before docker-compose - chmod -R 777 on extracted data directory to ensure container can write - Debug log HIVED_UID value --- .gitlab-ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d08bf724..89b1ca4b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -731,6 +731,10 @@ python_api_client_test: # Start docker-compose - | export PATH="$LOCAL_BIN:$PATH" + export HIVED_UID="${HIVED_UID:-0}" + echo "HIVED_UID=$HIVED_UID (container will run as this UID)" + # Ensure container can write to extracted data (HAF runs as UID 4000 by default) + sudo chmod -R 777 "${HAF_DATA_DIRECTORY}" || chmod -R 777 "${HAF_DATA_DIRECTORY}" || true echo -e "\e[0Ksection_start:$(date +%s):compose_up[collapsed=true]\r\e[0KStarting test environment..." 
docker-compose -f "${COMPOSE_FILE}" up -d echo -e "\e[0Ksection_end:$(date +%s):compose_up\r\e[0K" -- GitLab From 93a21e3f563f92c5c43d726ba5b7d6cd4802f08f Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 21:39:20 -0500 Subject: [PATCH 083/108] Fix test compose: use --skip-hived to prevent blockchain replay The HAF container was running a full blockchain replay instead of just starting PostgreSQL with the cached data. Adding --skip-hived flag tells the entrypoint to skip hived and only start PostgreSQL. This matches the balance_tracker approach for test jobs. --- docker/docker-compose-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/docker-compose-test.yml b/docker/docker-compose-test.yml index d22543fb..d4ba7c83 100644 --- a/docker/docker-compose-test.yml +++ b/docker/docker-compose-test.yml @@ -23,7 +23,7 @@ services: image: ${HAF_IMAGE_NAME:-registry.gitlab.syncad.com/hive/haf/instance:latest} # No replay - just start PostgreSQL with existing data entrypoint: /home/haf_admin/docker_entrypoint.sh - command: "" + command: ["--skip-hived"] environment: HIVED_UID: ${HIVED_UID:-0} DATADIR: /home/hived/datadir -- GitLab From 2c647ab5edfce6ac294f99bc6dff1196d48b6193 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 22:01:42 -0500 Subject: [PATCH 084/108] Fix HAF/PostgREST hostnames: use localhost for dind access The wait script runs in the CI job container, not inside the docker-compose network. Since docker-compose exposes ports to the dind host, we need to connect via localhost, not the internal container hostnames. --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 89b1ca4b..91a7508c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -681,9 +681,9 @@ python_api_client_test: HAF_DATA_DIRECTORY: "${CI_PROJECT_DIR}/test-data" HAF_SHM_DIRECTORY: "${CI_PROJECT_DIR}/test-shm" COMPOSE_FILE: "docker/docker-compose-test.yml" - # Container hostnames in docker-compose network - HAF_HOST: "haf" - POSTGREST_HOST: "postgrest" + # Service hostnames - use localhost since ports are exposed to dind host + HAF_HOST: "localhost" + POSTGREST_HOST: "localhost" # Use recursive for nested submodules (haf/hive) GIT_SUBMODULE_STRATEGY: recursive HAF_APP_SCHEMA: "hafbe_app" -- GitLab From 47cf2169bd9421d3fd4f2adf7af89777eabde26a Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 22:22:40 -0500 Subject: [PATCH 085/108] Fix service hostnames: use 'docker' for dind service In dind setup, DOCKER_HOST=tcp://docker:2375 means the docker daemon runs on the 'docker' service. Ports exposed by docker-compose containers are accessible at docker:port, not localhost:port. 
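For illustration, the difference is easy to demonstrate with two probes run from the CI job container (a sketch; port 3000 and the curl flags are illustrative, not part of this change):

    # Published ports live on the dind service ('docker'), not on localhost:
    curl -sf --connect-timeout 5 http://docker:3000/ && echo "reachable via dind host"
    curl -sf --connect-timeout 5 http://localhost:3000/ || echo "localhost is the CI job container itself"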
--- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 91a7508c..d077332f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -681,9 +681,9 @@ python_api_client_test: HAF_DATA_DIRECTORY: "${CI_PROJECT_DIR}/test-data" HAF_SHM_DIRECTORY: "${CI_PROJECT_DIR}/test-shm" COMPOSE_FILE: "docker/docker-compose-test.yml" - # Service hostnames - use localhost since ports are exposed to dind host - HAF_HOST: "localhost" - POSTGREST_HOST: "localhost" + # Service hostnames - use 'docker' since that's the dind service where containers run + HAF_HOST: "docker" + POSTGREST_HOST: "docker" # Use recursive for nested submodules (haf/hive) GIT_SUBMODULE_STRATEGY: recursive HAF_APP_SCHEMA: "hafbe_app" -- GitLab From 3137137f9b2cc25cb58ac1d1d13739f1aa8735b7 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 22:46:27 -0500 Subject: [PATCH 086/108] Fix wait-for-postgrest.sh to use docker-compose exec for DinD Use docker-compose exec to run pg_isready inside the HAF container instead of trying to reach the container from outside the docker network. For PostgREST, use the 'docker' hostname (dind service host) with exposed port. Also add PATH export in CI before calling wait script to ensure docker-compose is found. --- .gitlab-ci.yml | 1 + scripts/ci-helpers/wait-for-postgrest.sh | 49 ++++++++++-------------- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d077332f..a512fde7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -740,6 +740,7 @@ python_api_client_test: echo -e "\e[0Ksection_end:$(date +%s):compose_up\r\e[0K" # Wait for services to be ready - | + export PATH="$LOCAL_BIN:$PATH" echo -e "\e[0Ksection_start:$(date +%s):wait_services[collapsed=true]\r\e[0KWaiting for services..." ./scripts/ci-helpers/wait-for-postgrest.sh echo -e "\e[0Ksection_end:$(date +%s):wait_services\r\e[0K" diff --git a/scripts/ci-helpers/wait-for-postgrest.sh b/scripts/ci-helpers/wait-for-postgrest.sh index fac2728e..b180e944 100755 --- a/scripts/ci-helpers/wait-for-postgrest.sh +++ b/scripts/ci-helpers/wait-for-postgrest.sh @@ -5,41 +5,42 @@ # 1. HAF (PostgreSQL) - database must be accepting connections # 2. 
PostgREST - REST API must be responding # -# Usage: -# wait-for-postgrest.sh [options] +# In DinD (Docker-in-Docker) environments: +# - HAF check uses 'docker-compose exec' to run pg_isready inside the container +# - PostgREST check uses 'docker' hostname (the dind service host) # # Environment variables: -# HAF_HOST - HAF/PostgreSQL hostname (default: haf) -# HAF_PORT - PostgreSQL port (default: 5432) -# POSTGREST_HOST - PostgREST hostname (default: postgrest) +# COMPOSE_FILE - Docker compose file path (required) +# POSTGREST_HOST - Host for PostgREST HTTP checks (default: docker) # POSTGREST_PORT - PostgREST port (default: 3000) -# POSTGREST_ADMIN_PORT - PostgREST admin port (default: 3001) # WAIT_TIMEOUT - Total timeout in seconds (default: 300) set -euo pipefail -HAF_HOST="${HAF_HOST:-haf}" -HAF_PORT="${HAF_PORT:-5432}" -POSTGREST_HOST="${POSTGREST_HOST:-postgrest}" +# Configuration +COMPOSE_FILE="${COMPOSE_FILE:?COMPOSE_FILE must be set}" +POSTGREST_HOST="${POSTGREST_HOST:-docker}" POSTGREST_PORT="${POSTGREST_PORT:-3000}" -POSTGREST_ADMIN_PORT="${POSTGREST_ADMIN_PORT:-3001}" TIMEOUT="${WAIT_TIMEOUT:-300}" echo "=== Waiting for Test Services ===" -echo "HAF: ${HAF_HOST}:${HAF_PORT}" -echo "PostgREST: ${POSTGREST_HOST}:${POSTGREST_PORT} (admin: ${POSTGREST_ADMIN_PORT})" -echo "Timeout: ${TIMEOUT}s" +echo "Compose file: ${COMPOSE_FILE}" +echo "PostgREST: http://${POSTGREST_HOST}:${POSTGREST_PORT}" +echo "Timeout: ${TIMEOUT}s" echo "" WAITED=0 -# Wait for HAF/PostgreSQL +# Wait for HAF/PostgreSQL using docker-compose exec echo "--- Waiting for HAF (PostgreSQL) ---" -while ! pg_isready -h "${HAF_HOST}" -p "${HAF_PORT}" -q 2>/dev/null; do +while ! docker-compose -f "${COMPOSE_FILE}" exec -T haf pg_isready -U haf_admin -d haf_block_log 2>/dev/null; do sleep 5 WAITED=$((WAITED + 5)) if [[ $WAITED -ge $TIMEOUT ]]; then echo "ERROR: HAF not ready after ${TIMEOUT}s" + echo "" + echo "Container logs:" + docker-compose -f "${COMPOSE_FILE}" logs haf | tail -50 exit 1 fi echo "Waiting for HAF... (${WAITED}s)" @@ -47,17 +48,16 @@ done echo "HAF ready after ${WAITED}s" echo "" -# Wait for PostgREST +# Wait for PostgREST - in DinD, exposed ports are available at 'docker' host echo "--- Waiting for PostgREST ---" -while ! curl -sf "http://${POSTGREST_HOST}:${POSTGREST_ADMIN_PORT}/ready" >/dev/null 2>&1; do +while ! curl -sf "http://${POSTGREST_HOST}:${POSTGREST_PORT}/" >/dev/null 2>&1; do sleep 5 WAITED=$((WAITED + 5)) if [[ $WAITED -ge $TIMEOUT ]]; then echo "ERROR: PostgREST not ready after ${TIMEOUT}s" echo "" - echo "Debug info:" - echo " curl -v http://${POSTGREST_HOST}:${POSTGREST_ADMIN_PORT}/ready" - echo " docker compose logs postgrest" + echo "Container logs:" + docker-compose -f "${COMPOSE_FILE}" logs postgrest | tail -50 exit 1 fi echo "Waiting for PostgREST... 
(${WAITED}s)" @@ -65,14 +65,5 @@ done echo "PostgREST ready after ${WAITED}s" echo "" -# Verify API is responding -echo "--- Verifying API ---" -if curl -sf "http://${POSTGREST_HOST}:${POSTGREST_PORT}/" >/dev/null 2>&1; then - echo "API responding at http://${POSTGREST_HOST}:${POSTGREST_PORT}/" -else - echo "WARNING: API root not responding, but admin endpoint is ready" -fi - -echo "" echo "=== All Services Ready ===" exit 0 -- GitLab From 57f5cc2559767c7aae6d2b494ee12583bf974bbe Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 23:08:11 -0500 Subject: [PATCH 087/108] Fix shm volume: use docker volume instead of bind mount In DinD environments, bind mounts to paths created by the CI runner container are not visible to the dind daemon. Since we run with --skip-hived, we don't need persistent shm storage anyway. Use a simple named docker volume for haf_shmdir instead of a bind mount. --- .gitlab-ci.yml | 9 ++++----- docker/docker-compose-test.yml | 7 ++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a512fde7..78c389f6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -677,9 +677,8 @@ python_api_client_test: # Docker-in-docker connection (disable TLS for simplicity) DOCKER_TLS_CERTDIR: "" DOCKER_HOST: "tcp://docker:2375" - # Test data directories - extracted from sync cache + # Test data directory - extracted from sync cache HAF_DATA_DIRECTORY: "${CI_PROJECT_DIR}/test-data" - HAF_SHM_DIRECTORY: "${CI_PROJECT_DIR}/test-shm" COMPOSE_FILE: "docker/docker-compose-test.yml" # Service hostnames - use 'docker' since that's the dind service where containers run HAF_HOST: "docker" @@ -720,8 +719,8 @@ python_api_client_test: echo "docker-compose path: $(command -v docker-compose)" # Fetch cache-manager - !reference [.fetch_cache_manager, before_script] - # Create data directories - - mkdir -p "${HAF_DATA_DIRECTORY}" "${HAF_SHM_DIRECTORY}" + # Create data directory + - mkdir -p "${HAF_DATA_DIRECTORY}" # Extract sync cache - | echo -e "\e[0Ksection_start:$(date +%s):extract_cache[collapsed=true]\r\e[0KExtracting sync cache..." @@ -751,7 +750,7 @@ python_api_client_test: docker-compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" logs > docker/container-logs.txt 2>&1 || true docker-compose -f "${COMPOSE_FILE:-docker/docker-compose-test.yml}" down -v || true # Cleanup test data - sudo rm -rf "${HAF_DATA_DIRECTORY:-/tmp/test-data}" "${HAF_SHM_DIRECTORY:-/tmp/test-shm}" || true + sudo rm -rf "${HAF_DATA_DIRECTORY:-/tmp/test-data}" || true echo -e "\e[0Ksection_end:$(date +%s):compose_down\r\e[0K" artifacts: paths: diff --git a/docker/docker-compose-test.yml b/docker/docker-compose-test.yml index d4ba7c83..c90b6d3a 100644 --- a/docker/docker-compose-test.yml +++ b/docker/docker-compose-test.yml @@ -90,9 +90,6 @@ volumes: type: none # Cache structure: HAF_DATA_DIRECTORY/datadir/haf_db_store/pgdata device: ${HAF_DATA_DIRECTORY:-/tmp/haf_data}/datadir + # SHM directory - use simple named volume (not bind mount) + # In test mode with --skip-hived, we don't need persistent shm haf_shmdir: - driver: local - driver_opts: - o: bind - type: none - device: ${HAF_SHM_DIRECTORY:-/tmp/haf_shm} -- GitLab From f1250f4e26485d523c58e39eddd29fad478daf40 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Tue, 30 Dec 2025 23:34:26 -0500 Subject: [PATCH 088/108] Add connect timeout to curl in wait script Prevents curl from hanging for 2+ minutes when docker hostname is temporarily unreachable. Using 5 second connect timeout. 
--- scripts/ci-helpers/wait-for-postgrest.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ci-helpers/wait-for-postgrest.sh b/scripts/ci-helpers/wait-for-postgrest.sh
index b180e944..bd738c99 100755
--- a/scripts/ci-helpers/wait-for-postgrest.sh
+++ b/scripts/ci-helpers/wait-for-postgrest.sh
@@ -50,7 +50,7 @@ echo ""
 
 # Wait for PostgREST - in DinD, exposed ports are available at 'docker' host
 echo "--- Waiting for PostgREST ---"
-while ! curl -sf "http://${POSTGREST_HOST}:${POSTGREST_PORT}/" >/dev/null 2>&1; do
+while ! curl -sf --connect-timeout 5 "http://${POSTGREST_HOST}:${POSTGREST_PORT}/" >/dev/null 2>&1; do
   sleep 5
   WAITED=$((WAITED + 5))
   if [[ $WAITED -ge $TIMEOUT ]]; then
-- GitLab

From 270d32b0acfe545983ecc88fa495b24d114b94a7 Mon Sep 17 00:00:00 2001
From: Dan Notestein
Date: Wed, 31 Dec 2025 00:22:58 -0500
Subject: [PATCH 089/108] Use Alpine ci-runner image for test jobs instead of ci-base-image overrides

Fix test job failures by using the ci-runner image (Alpine) which has
the necessary tools (postgresql16-client, py3-psycopg2, curl) instead
of overriding to ci-base-image (Ubuntu) which lacks these packages.

Changes:
- docker/ci/Dockerfile: Add curl and py3-psycopg2 to ci-runner
- .gitlab-ci.yml: Remove image overrides from regression-test,
  performance-test, and pattern-test so they use ci-runner

This fixes:
- psql: command not found (regression-test)
- ModuleNotFoundError: No module named 'psycopg2' (performance-test)
- curl connectivity issues in DinD
--- .gitlab-ci.yml | 3 ---
 docker/ci/Dockerfile | 2 ++
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 78c389f6..9b74c3ed 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -767,7 +767,6 @@ python_api_client_test:
 
 regression-test:
   extends: .hafbe_test_base
-  image: $BUILDER_IMAGE_PATH
   script:
     - |
       echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning regression tests..."
@@ -799,7 +798,6 @@ setup-scripts-test:
 
 performance-test:
   extends: .hafbe_test_base
-  image: $BUILDER_IMAGE_PATH
   script:
     - |
       echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning performance tests..."
@@ -824,7 +822,6 @@ pattern-test:
   extends:
     - .hafbe_test_base
     - .pytest_based_template
-  image: $BUILDER_IMAGE_PATH
   variables:
     JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml
     POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools

diff --git a/docker/ci/Dockerfile b/docker/ci/Dockerfile
index 84acd82c..29429d09 100644
--- a/docker/ci/Dockerfile
+++ b/docker/ci/Dockerfile
@@ -7,7 +7,9 @@ RUN <<-EOF
     # Install additional CI dependencies
     apk add --no-cache \
         7zip \
+        curl \
         postgresql16-client \
+        py3-psycopg2 \
         sudo
 EOF
-- GitLab

From db32843795e278280118bedc6f6769533a523b05 Mon Sep 17 00:00:00 2001
From: Dan Notestein
Date: Wed, 31 Dec 2025 00:51:05 -0500
Subject: [PATCH 090/108] Fix DinD networking: use docker-compose exec for PostgREST check

In Docker-in-Docker environments, external curl from the CI job
container cannot reach containers running inside the dind service.
Changed PostgREST readiness check to use docker-compose exec to run
curl inside the postgrest container.

Also added explicit image to pattern-test to override the
.pytest_based_template which sets docker:latest.
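Note that this assumes the postgrest container ships a curl binary; a quick hypothetical probe would confirm that before relying on it (and is exactly the gap the next patch uncovers):

    # Check whether the minimal postgrest image actually provides curl:
    docker-compose -f docker/docker-compose-test.yml exec -T postgrest \
      sh -c 'command -v curl || echo "curl not available"'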
--- .gitlab-ci.yml | 2 ++ scripts/ci-helpers/wait-for-postgrest.sh | 12 ++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9b74c3ed..251c0235 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -822,6 +822,8 @@ pattern-test: extends: - .hafbe_test_base - .pytest_based_template + # Explicit image needed to override .pytest_based_template (which sets docker:latest) + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 variables: JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools diff --git a/scripts/ci-helpers/wait-for-postgrest.sh b/scripts/ci-helpers/wait-for-postgrest.sh index bd738c99..f00957ab 100755 --- a/scripts/ci-helpers/wait-for-postgrest.sh +++ b/scripts/ci-helpers/wait-for-postgrest.sh @@ -7,25 +7,21 @@ # # In DinD (Docker-in-Docker) environments: # - HAF check uses 'docker-compose exec' to run pg_isready inside the container -# - PostgREST check uses 'docker' hostname (the dind service host) +# - PostgREST check uses 'docker-compose exec' to run curl inside the container +# (external curl from CI job cannot reach containers running inside dind) # # Environment variables: # COMPOSE_FILE - Docker compose file path (required) -# POSTGREST_HOST - Host for PostgREST HTTP checks (default: docker) -# POSTGREST_PORT - PostgREST port (default: 3000) # WAIT_TIMEOUT - Total timeout in seconds (default: 300) set -euo pipefail # Configuration COMPOSE_FILE="${COMPOSE_FILE:?COMPOSE_FILE must be set}" -POSTGREST_HOST="${POSTGREST_HOST:-docker}" -POSTGREST_PORT="${POSTGREST_PORT:-3000}" TIMEOUT="${WAIT_TIMEOUT:-300}" echo "=== Waiting for Test Services ===" echo "Compose file: ${COMPOSE_FILE}" -echo "PostgREST: http://${POSTGREST_HOST}:${POSTGREST_PORT}" echo "Timeout: ${TIMEOUT}s" echo "" @@ -48,9 +44,9 @@ done echo "HAF ready after ${WAITED}s" echo "" -# Wait for PostgREST - in DinD, exposed ports are available at 'docker' host +# Wait for PostgREST using docker-compose exec (in DinD, external curl can't reach containers) echo "--- Waiting for PostgREST ---" -while ! curl -sf --connect-timeout 5 "http://${POSTGREST_HOST}:${POSTGREST_PORT}/" >/dev/null 2>&1; do +while ! docker-compose -f "${COMPOSE_FILE}" exec -T postgrest curl -sf http://localhost:3000/ >/dev/null 2>&1; do sleep 5 WAITED=$((WAITED + 5)) if [[ $WAITED -ge $TIMEOUT ]]; then -- GitLab From 9ad8a7223f839ea90a00849499398eaee0395f72 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 01:14:22 -0500 Subject: [PATCH 091/108] Fix PostgREST wait: check container health instead of exec curl The postgrest/postgrest image is minimal and may not have curl available for docker-compose exec. Changed to check container health status via 'docker-compose ps' output instead. The container's own healthcheck (defined in docker-compose-test.yml) runs inside the container and will report healthy status. 
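An equivalent check that avoids grepping human-readable ps output is to read the health state via docker inspect; a sketch, assuming the docker CLI can reach the dind daemon:

    # Resolve the container ID, then query its healthcheck state directly:
    CID=$(docker-compose -f "${COMPOSE_FILE}" ps -q postgrest)
    docker inspect -f '{{.State.Health.Status}}' "$CID"  # starting | healthy | unhealthy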
--- scripts/ci-helpers/wait-for-postgrest.sh | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/ci-helpers/wait-for-postgrest.sh b/scripts/ci-helpers/wait-for-postgrest.sh index f00957ab..89c7e099 100755 --- a/scripts/ci-helpers/wait-for-postgrest.sh +++ b/scripts/ci-helpers/wait-for-postgrest.sh @@ -7,8 +7,8 @@ # # In DinD (Docker-in-Docker) environments: # - HAF check uses 'docker-compose exec' to run pg_isready inside the container -# - PostgREST check uses 'docker-compose exec' to run curl inside the container -# (external curl from CI job cannot reach containers running inside dind) +# - PostgREST check uses 'docker-compose ps' to check container health status +# (the postgrest image is minimal and has no curl, but the healthcheck works) # # Environment variables: # COMPOSE_FILE - Docker compose file path (required) @@ -44,14 +44,23 @@ done echo "HAF ready after ${WAITED}s" echo "" -# Wait for PostgREST using docker-compose exec (in DinD, external curl can't reach containers) +# Wait for PostgREST by checking container health status +# Use docker-compose ps output which shows (healthy) status echo "--- Waiting for PostgREST ---" -while ! docker-compose -f "${COMPOSE_FILE}" exec -T postgrest curl -sf http://localhost:3000/ >/dev/null 2>&1; do +while true; do + # Check if container shows as healthy in docker-compose ps output + PS_OUTPUT=$(docker-compose -f "${COMPOSE_FILE}" ps postgrest 2>/dev/null || echo "") + if echo "$PS_OUTPUT" | grep -q "(healthy)"; then + break + fi sleep 5 WAITED=$((WAITED + 5)) if [[ $WAITED -ge $TIMEOUT ]]; then echo "ERROR: PostgREST not ready after ${TIMEOUT}s" echo "" + echo "Container status:" + docker-compose -f "${COMPOSE_FILE}" ps postgrest 2>/dev/null || true + echo "" echo "Container logs:" docker-compose -f "${COMPOSE_FILE}" logs postgrest | tail -50 exit 1 -- GitLab From 279a3b35f25ad35fea28cbc0b7afc8bd4c695a3f Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 01:33:15 -0500 Subject: [PATCH 092/108] Fix PostgREST healthcheck: use wget instead of curl The postgrest/postgrest:latest image is minimal and doesn't include curl. Changed healthcheck to use wget which is available in the Alpine-based image. --- docker/docker-compose-test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/docker-compose-test.yml b/docker/docker-compose-test.yml index c90b6d3a..cd8bc6f8 100644 --- a/docker/docker-compose-test.yml +++ b/docker/docker-compose-test.yml @@ -72,7 +72,8 @@ services: - "3000:3000" - "3001:3001" healthcheck: - test: ["CMD-SHELL", "curl -sf http://localhost:3001/ready || exit 1"] + # Use wget instead of curl - postgrest image is minimal and doesn't have curl + test: ["CMD-SHELL", "wget -q --spider http://localhost:3001/ready || exit 1"] interval: 5s timeout: 3s retries: 30 -- GitLab From 2c7bfd7ecdb4857570ae900c91a181b89a3494a3 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 02:04:02 -0500 Subject: [PATCH 093/108] Fix PostgREST check: use HAF container to curl PostgREST The postgrest/postgrest image is minimal Alpine with no curl or wget. Changed to use the HAF container (Ubuntu-based with curl) to reach PostgREST via the docker network at http://postgrest:3000/. Removed the broken healthcheck from docker-compose-test.yml since the wait script handles the readiness check. 
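This relies on two assumptions worth verifying up front: service names resolve on the shared compose network, and the HAF image actually carries curl. A hypothetical sanity check:

    # Confirm the tool exists before trusting the probe result:
    docker-compose -f "${COMPOSE_FILE}" exec -T haf \
      sh -c 'command -v curl && curl -sf http://postgrest:3000/'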
--- docker/docker-compose-test.yml | 9 ++------- scripts/ci-helpers/wait-for-postgrest.sh | 17 ++++++----------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/docker/docker-compose-test.yml b/docker/docker-compose-test.yml index cd8bc6f8..d22813d0 100644 --- a/docker/docker-compose-test.yml +++ b/docker/docker-compose-test.yml @@ -71,13 +71,8 @@ services: ports: - "3000:3000" - "3001:3001" - healthcheck: - # Use wget instead of curl - postgrest image is minimal and doesn't have curl - test: ["CMD-SHELL", "wget -q --spider http://localhost:3001/ready || exit 1"] - interval: 5s - timeout: 3s - retries: 30 - start_period: 30s + # Note: The postgrest image is minimal and has no curl/wget, so we can't use healthcheck. + # The wait-for-postgrest.sh script uses the HAF container to curl PostgREST instead. networks: haf-network-test: diff --git a/scripts/ci-helpers/wait-for-postgrest.sh b/scripts/ci-helpers/wait-for-postgrest.sh index 89c7e099..69dc5b78 100755 --- a/scripts/ci-helpers/wait-for-postgrest.sh +++ b/scripts/ci-helpers/wait-for-postgrest.sh @@ -7,8 +7,8 @@ # # In DinD (Docker-in-Docker) environments: # - HAF check uses 'docker-compose exec' to run pg_isready inside the container -# - PostgREST check uses 'docker-compose ps' to check container health status -# (the postgrest image is minimal and has no curl, but the healthcheck works) +# - PostgREST check uses 'docker-compose exec haf curl' to reach PostgREST +# (the postgrest image is minimal and has no curl/wget, so we use the HAF container) # # Environment variables: # COMPOSE_FILE - Docker compose file path (required) @@ -44,22 +44,17 @@ done echo "HAF ready after ${WAITED}s" echo "" -# Wait for PostgREST by checking container health status -# Use docker-compose ps output which shows (healthy) status +# Wait for PostgREST by using HAF container to curl the API +# The HAF container (Ubuntu-based) has curl and can reach postgrest via docker network echo "--- Waiting for PostgREST ---" -while true; do - # Check if container shows as healthy in docker-compose ps output - PS_OUTPUT=$(docker-compose -f "${COMPOSE_FILE}" ps postgrest 2>/dev/null || echo "") - if echo "$PS_OUTPUT" | grep -q "(healthy)"; then - break - fi +while ! docker-compose -f "${COMPOSE_FILE}" exec -T haf curl -sf http://postgrest:3000/ >/dev/null 2>&1; do sleep 5 WAITED=$((WAITED + 5)) if [[ $WAITED -ge $TIMEOUT ]]; then echo "ERROR: PostgREST not ready after ${TIMEOUT}s" echo "" echo "Container status:" - docker-compose -f "${COMPOSE_FILE}" ps postgrest 2>/dev/null || true + docker-compose -f "${COMPOSE_FILE}" ps 2>/dev/null || true echo "" echo "Container logs:" docker-compose -f "${COMPOSE_FILE}" logs postgrest | tail -50 -- GitLab From 0766595dc564fca4dbe6a2e036187a1d8281fa0c Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 02:23:44 -0500 Subject: [PATCH 094/108] Fix PostgREST check: use bash /dev/tcp instead of curl Both HAF and PostgREST containers are minimal and don't have curl/wget. Use bash's built-in /dev/tcp feature to check if the PostgREST port is accepting connections. 
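For reference, /dev/tcp is a bash feature (not POSIX sh): redirecting output to /dev/tcp/HOST/PORT makes bash attempt a TCP connection, and the redirection's exit status reports whether it succeeded. A minimal sketch:

    # Requires bash. Confirms only that the port accepts TCP connections,
    # not that the HTTP service behind it is actually healthy.
    if (echo > /dev/tcp/postgrest/3000) 2>/dev/null; then
      echo "port 3000 open"
    else
      echo "port 3000 closed"
    fi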
--- scripts/ci-helpers/wait-for-postgrest.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/ci-helpers/wait-for-postgrest.sh b/scripts/ci-helpers/wait-for-postgrest.sh index 69dc5b78..d67f5019 100755 --- a/scripts/ci-helpers/wait-for-postgrest.sh +++ b/scripts/ci-helpers/wait-for-postgrest.sh @@ -7,8 +7,8 @@ # # In DinD (Docker-in-Docker) environments: # - HAF check uses 'docker-compose exec' to run pg_isready inside the container -# - PostgREST check uses 'docker-compose exec haf curl' to reach PostgREST -# (the postgrest image is minimal and has no curl/wget, so we use the HAF container) +# - PostgREST check uses 'docker-compose exec haf bash -c /dev/tcp/...' to verify port +# (uses bash built-in TCP feature, no external tools like curl/wget needed) # # Environment variables: # COMPOSE_FILE - Docker compose file path (required) @@ -44,10 +44,10 @@ done echo "HAF ready after ${WAITED}s" echo "" -# Wait for PostgREST by using HAF container to curl the API -# The HAF container (Ubuntu-based) has curl and can reach postgrest via docker network +# Wait for PostgREST by checking if port 3000 is open from HAF container +# Using bash /dev/tcp which doesn't require external tools (curl/wget/nc) echo "--- Waiting for PostgREST ---" -while ! docker-compose -f "${COMPOSE_FILE}" exec -T haf curl -sf http://postgrest:3000/ >/dev/null 2>&1; do +while ! docker-compose -f "${COMPOSE_FILE}" exec -T haf bash -c 'echo > /dev/tcp/postgrest/3000' 2>/dev/null; do sleep 5 WAITED=$((WAITED + 5)) if [[ $WAITED -ge $TIMEOUT ]]; then -- GitLab From 0a36ae1aa618ab39e876b1313e44733c83ab223b Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 02:43:05 -0500 Subject: [PATCH 095/108] Add python3 to ci-runner image for test scripts --- docker/ci/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/ci/Dockerfile b/docker/ci/Dockerfile index 29429d09..9fab52e7 100644 --- a/docker/ci/Dockerfile +++ b/docker/ci/Dockerfile @@ -9,6 +9,7 @@ RUN <<-EOF 7zip \ curl \ postgresql16-client \ + python3 \ py3-psycopg2 \ sudo EOF -- GitLab From 63dd43d6244f92d4df8bdb8804d0551ef9dc54fd Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 03:37:17 -0500 Subject: [PATCH 096/108] Bump ci-runner tag to force rebuild with python3 --- .gitlab-ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 251c0235..b209d8dd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -375,7 +375,7 @@ docker-ci-runner-build: - when: on_success variables: BASE_REPO_NAME: "" - BASE_TAG: "docker-26.1.4-1" + BASE_TAG: "docker-26.1.4-2" NAME: "ci-runner" TARGET: "ci-runner-ci" # Only builds Dockerfile, doesn't need submodules @@ -476,7 +476,7 @@ build_python_api_client_wheel: sync: extends: .docker_image_builder_job_template stage: sync - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-2 needs: - job: quick_test_setup artifacts: true @@ -658,7 +658,7 @@ python_api_client_test: .hafbe_test_base: extends: .docker_image_builder_job_template stage: test - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-2 needs: - job: quick_test_setup artifacts: true @@ -823,7 +823,7 @@ pattern-test: - .hafbe_test_base - .pytest_based_template # Explicit image needed to override .pytest_based_template 
(which sets docker:latest) - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-1 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-2 variables: JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools -- GitLab From af67e0017c063e2b597e69175c0eb63bf67c5173 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 04:14:35 -0500 Subject: [PATCH 097/108] Add py3-pip to ci-runner and bump tag to docker-26.1.4-3 Test jobs need pip3 to install poetry. --- .gitlab-ci.yml | 8 ++++---- docker/ci/Dockerfile | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b209d8dd..00d78d6c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -375,7 +375,7 @@ docker-ci-runner-build: - when: on_success variables: BASE_REPO_NAME: "" - BASE_TAG: "docker-26.1.4-2" + BASE_TAG: "docker-26.1.4-3" NAME: "ci-runner" TARGET: "ci-runner-ci" # Only builds Dockerfile, doesn't need submodules @@ -476,7 +476,7 @@ build_python_api_client_wheel: sync: extends: .docker_image_builder_job_template stage: sync - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-2 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-3 needs: - job: quick_test_setup artifacts: true @@ -658,7 +658,7 @@ python_api_client_test: .hafbe_test_base: extends: .docker_image_builder_job_template stage: test - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-2 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-3 needs: - job: quick_test_setup artifacts: true @@ -823,7 +823,7 @@ pattern-test: - .hafbe_test_base - .pytest_based_template # Explicit image needed to override .pytest_based_template (which sets docker:latest) - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-2 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-3 variables: JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools diff --git a/docker/ci/Dockerfile b/docker/ci/Dockerfile index 9fab52e7..b1c8eb0c 100644 --- a/docker/ci/Dockerfile +++ b/docker/ci/Dockerfile @@ -10,6 +10,7 @@ RUN <<-EOF curl \ postgresql16-client \ python3 \ + py3-pip \ py3-psycopg2 \ sudo EOF -- GitLab From 413c20f4f045abc9d3ae4df5d5bdd218ebcf6d50 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 04:51:38 -0500 Subject: [PATCH 098/108] Add JMeter/m2u and PATH fix for poetry - Copy JMeter and m2u from benchmark-test-runner - Add openjdk11-jre for JMeter - Add PATH export for poetry (pip installs to ~/.local/bin) - Bump ci-runner tag to docker-26.1.4-4 --- .gitlab-ci.yml | 10 ++++++---- docker/ci/Dockerfile | 11 +++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 00d78d6c..78e77a78 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -375,7 +375,7 @@ docker-ci-runner-build: - when: on_success variables: BASE_REPO_NAME: "" - BASE_TAG: "docker-26.1.4-3" + BASE_TAG: "docker-26.1.4-4" NAME: "ci-runner" TARGET: "ci-runner-ci" # Only builds Dockerfile, doesn't need submodules @@ -476,7 +476,7 @@ build_python_api_client_wheel: sync: extends: .docker_image_builder_job_template stage: sync - image: 
registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-3 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-4 needs: - job: quick_test_setup artifacts: true @@ -658,7 +658,7 @@ python_api_client_test: .hafbe_test_base: extends: .docker_image_builder_job_template stage: test - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-3 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-4 needs: - job: quick_test_setup artifacts: true @@ -823,7 +823,7 @@ pattern-test: - .hafbe_test_base - .pytest_based_template # Explicit image needed to override .pytest_based_template (which sets docker:latest) - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-3 + image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-4 variables: JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools @@ -834,6 +834,8 @@ pattern-test: - !reference [.hafbe_test_base, before_script] # Install poetry for pytest template (Alpine packages) - pip3 install --break-system-packages poetry + # Add pip user bin to PATH (poetry installs here) + - export PATH="$HOME/.local/bin:$PATH" - !reference [.pytest_based_template, before_script] script: - | diff --git a/docker/ci/Dockerfile b/docker/ci/Dockerfile index b1c8eb0c..62088f02 100644 --- a/docker/ci/Dockerfile +++ b/docker/ci/Dockerfile @@ -2,17 +2,28 @@ # CI runner image based on common-ci-configuration docker-builder FROM registry.gitlab.syncad.com/hive/common-ci-configuration/docker-builder:latest +# Copy JMeter and m2u from benchmark-test-runner +COPY --from=registry.gitlab.syncad.com/hive/common-ci-configuration/benchmark-test-runner:latest \ + --link /opt/tools/jmeter /opt/tools/jmeter +COPY --from=registry.gitlab.syncad.com/hive/common-ci-configuration/benchmark-test-runner:latest \ + --link /opt/tools/m2u /opt/tools/m2u + USER root RUN <<-EOF # Install additional CI dependencies apk add --no-cache \ 7zip \ curl \ + openjdk11-jre \ postgresql16-client \ python3 \ py3-pip \ py3-psycopg2 \ sudo + + # Create symlinks for JMeter and m2u + ln -s /opt/tools/jmeter/bin/jmeter.sh /usr/bin/jmeter + ln -s /opt/tools/m2u/m2u /usr/bin/m2u EOF # Create hived user for compatibility with existing scripts (if not exists) -- GitLab From 5a9b945662faa81c3978af296ab171ea66f85538 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 06:58:42 -0500 Subject: [PATCH 099/108] Migrate to shared haf-app-test-runner image from common-ci-configuration - Use haf-app-test-runner:feature-haf-app-test-runner (Python 3.14, Poetry, Docker, JMeter) - Remove docker-ci-runner-build job (no longer needed) - Remove docker/ci/Dockerfile (using shared image) - Simplify .hafbe_test_base before_script (docker/compose pre-installed) - Remove pip install poetry from pattern-test (pre-installed in image) This eliminates per-project CI image build overhead and provides consistent test environment across all HAF applications. 
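A quick smoke test of the shared image's toolchain might look like this (hypothetical; assumes the listed tools are on PATH as this commit message claims):

    docker run --rm \
      registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:feature-haf-app-test-runner \
      sh -c 'python3 --version && poetry --version && docker --version && docker compose version'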
--- .gitlab-ci.yml | 52 +++++++++----------------------------------- docker/ci/Dockerfile | 33 ---------------------------- 2 files changed, 10 insertions(+), 75 deletions(-) delete mode 100644 docker/ci/Dockerfile diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 78e77a78..9571e853 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -367,20 +367,6 @@ prepare_haf_data: tags: - public-runner-docker -docker-ci-runner-build: - extends: .docker-base-build-template - rules: - - if: $DOCS_ONLY == "true" - when: never - - when: on_success - variables: - BASE_REPO_NAME: "" - BASE_TAG: "docker-26.1.4-4" - NAME: "ci-runner" - TARGET: "ci-runner-ci" - # Only builds Dockerfile, doesn't need submodules - GIT_SUBMODULE_STRATEGY: none - docker-setup-docker-image-build: extends: .docker-base-build-template rules: @@ -476,7 +462,8 @@ build_python_api_client_wheel: sync: extends: .docker_image_builder_job_template stage: sync - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-4 + # Shared HAF app test runner from common-ci-configuration + image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:feature-haf-app-test-runner needs: - job: quick_test_setup artifacts: true @@ -486,7 +473,6 @@ sync: - job: prepare_haf_data optional: true # Skipped in QUICK_TEST mode - job: docker-setup-docker-image-build - - job: docker-ci-runner-build rules: # Skip for docs-only changes - if: $DOCS_ONLY == "true" @@ -658,14 +644,15 @@ python_api_client_test: .hafbe_test_base: extends: .docker_image_builder_job_template stage: test - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-4 + # Shared HAF app test runner from common-ci-configuration + # Has: Python 3.14, Poetry, Docker CLI, docker-compose, JMeter, m2u + image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:feature-haf-app-test-runner needs: - job: quick_test_setup artifacts: true optional: true - job: sync artifacts: true - - job: docker-ci-runner-build - job: prepare_haf_image artifacts: true optional: true @@ -693,30 +680,14 @@ python_api_client_test: before_script: # Git setup (from docker_image_builder_job_template, without buildx) - git config --global --add safe.directory '*' - # Install docker and docker-compose if not available (use $LOCAL_BIN for non-root images) + # Create docker-compose wrapper for 'docker compose' plugin (haf-app-test-runner has docker-compose-v2) - | mkdir -p "$LOCAL_BIN" export PATH="$LOCAL_BIN:$PATH" - if ! command -v docker &> /dev/null; then - echo "Installing Docker CLI to $LOCAL_BIN..." - DOCKER_VERSION="27.3.1" - curl -fsSL "https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz" | tar xz -C /tmp - mv /tmp/docker/docker "$LOCAL_BIN/" - fi - # Ensure docker-compose command is available (standalone or wrapper for plugin) if ! command -v docker-compose &> /dev/null; then - if docker compose version &> /dev/null; then - echo "Creating docker-compose wrapper for docker compose plugin..." - printf '#!/bin/sh\nexec docker compose "$@"\n' > "$LOCAL_BIN/docker-compose" - chmod +x "$LOCAL_BIN/docker-compose" - else - echo "Installing docker-compose standalone to $LOCAL_BIN..." 
- COMPOSE_VERSION="v2.30.3" - curl -fsSL "https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-linux-x86_64" -o "$LOCAL_BIN/docker-compose" - chmod +x "$LOCAL_BIN/docker-compose" - fi + printf '#!/bin/sh\nexec docker compose "$@"\n' > "$LOCAL_BIN/docker-compose" + chmod +x "$LOCAL_BIN/docker-compose" fi - echo "docker-compose path: $(command -v docker-compose)" # Fetch cache-manager - !reference [.fetch_cache_manager, before_script] # Create data directory @@ -823,7 +794,7 @@ pattern-test: - .hafbe_test_base - .pytest_based_template # Explicit image needed to override .pytest_based_template (which sets docker:latest) - image: registry.gitlab.syncad.com/hive/haf_block_explorer/ci-runner:docker-26.1.4-4 + image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:feature-haf-app-test-runner variables: JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools @@ -832,10 +803,7 @@ pattern-test: TAVERN_DIR: $CI_PROJECT_DIR/tests/tavern before_script: - !reference [.hafbe_test_base, before_script] - # Install poetry for pytest template (Alpine packages) - - pip3 install --break-system-packages poetry - # Add pip user bin to PATH (poetry installs here) - - export PATH="$HOME/.local/bin:$PATH" + # Poetry is pre-installed in haf-app-test-runner image - !reference [.pytest_based_template, before_script] script: - | diff --git a/docker/ci/Dockerfile b/docker/ci/Dockerfile deleted file mode 100644 index 62088f02..00000000 --- a/docker/ci/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -# syntax=registry.gitlab.syncad.com/hive/common-ci-configuration/dockerfile:1.11 -# CI runner image based on common-ci-configuration docker-builder -FROM registry.gitlab.syncad.com/hive/common-ci-configuration/docker-builder:latest - -# Copy JMeter and m2u from benchmark-test-runner -COPY --from=registry.gitlab.syncad.com/hive/common-ci-configuration/benchmark-test-runner:latest \ - --link /opt/tools/jmeter /opt/tools/jmeter -COPY --from=registry.gitlab.syncad.com/hive/common-ci-configuration/benchmark-test-runner:latest \ - --link /opt/tools/m2u /opt/tools/m2u - -USER root -RUN <<-EOF - # Install additional CI dependencies - apk add --no-cache \ - 7zip \ - curl \ - openjdk11-jre \ - postgresql16-client \ - python3 \ - py3-pip \ - py3-psycopg2 \ - sudo - - # Create symlinks for JMeter and m2u - ln -s /opt/tools/jmeter/bin/jmeter.sh /usr/bin/jmeter - ln -s /opt/tools/m2u/m2u /usr/bin/m2u -EOF - -# Create hived user for compatibility with existing scripts (if not exists) -RUN id hived 2>/dev/null || adduser -D -s /bin/bash hived && \ - grep -q "^hived " /etc/sudoers || echo "hived ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers - -USER hived \ No newline at end of file -- GitLab From bfa872d8d67a22995b80ad59d2bf2b892f334136 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 07:07:38 -0500 Subject: [PATCH 100/108] Fix haf-app-test-runner image tag to 2.0 The docker-bake.hcl uses HAF_APP_TEST_RUNNER_VERSION (2.0) not branch slug. 
--- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9571e853..fd1b066f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -463,7 +463,7 @@ sync: extends: .docker_image_builder_job_template stage: sync # Shared HAF app test runner from common-ci-configuration - image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:feature-haf-app-test-runner + image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:2.0 needs: - job: quick_test_setup artifacts: true @@ -646,7 +646,7 @@ python_api_client_test: stage: test # Shared HAF app test runner from common-ci-configuration # Has: Python 3.14, Poetry, Docker CLI, docker-compose, JMeter, m2u - image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:feature-haf-app-test-runner + image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:2.0 needs: - job: quick_test_setup artifacts: true @@ -794,7 +794,7 @@ pattern-test: - .hafbe_test_base - .pytest_based_template # Explicit image needed to override .pytest_based_template (which sets docker:latest) - image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:feature-haf-app-test-runner + image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:2.0 variables: JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools -- GitLab From 6b406dcca0236f7b3a5d7ef4fd5666bed97c1abf Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 07:51:16 -0500 Subject: [PATCH 101/108] Add DOCKER_HOST to sync job for DinD connection --- .gitlab-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fd1b066f..ed464a78 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -479,6 +479,9 @@ sync: when: never - when: on_success variables: + # Docker-in-docker connection (disable TLS for simplicity) + DOCKER_TLS_CERTDIR: "" + DOCKER_HOST: "tcp://docker:2375" DATA_SOURCE: ${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT} DATADIR: ${CI_PROJECT_DIR}/${CI_JOB_ID}/datadir SHM_DIR: ${CI_PROJECT_DIR}/${CI_JOB_ID}/shm_dir -- GitLab From 20a932fccee03d8e3976bf5666501475a7b2b505 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 13:18:09 -0500 Subject: [PATCH 102/108] Fix app-setup permission: run as root to write to scripts directory --- .gitlab-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ed464a78..72413a02 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -552,7 +552,8 @@ sync: rm -rf "${CI_PROJECT_DIR}/docker/blockchain"/* cp -a -L "${DATADIR}/blockchain"/* "${CI_PROJECT_DIR}/docker/blockchain/" - "${CI_PROJECT_DIR}/scripts/ci-helpers/start-ci-test-environment.sh" + # Run app-setup as root to allow writing to the container's scripts directory + "${CI_PROJECT_DIR}/scripts/ci-helpers/start-ci-test-environment.sh" --setup-uid=0 echo -e "\e[0Ksection_end:$(date +%s):compose\r\e[0K" echo -e "\e[0Ksection_start:$(date +%s):wait[collapsed=true]\r\e[0KWaiting for HAF BE startup..." 
-- GitLab From acd28f2f7eaae9e7a61be993bf1765b90d138082 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 13:32:20 -0500 Subject: [PATCH 103/108] Fix after_script permissions for shm_dir copy and cache creation --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 72413a02..f76a5fe1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -579,10 +579,10 @@ sync: popd tar -cf - $(pwd)/docker/*.log | 7z a -si -mx9 docker/container-logs.tar.7z - cp -a "${SHM_DIR}" "${DATADIR}/shm_dir" + sudo cp -a "${SHM_DIR}" "${DATADIR}/shm_dir" LOCAL_HAFBE_CACHE="${DATA_CACHE_HAF_PREFIX}_${HAFBE_CACHE_KEY}" - mkdir -p "${LOCAL_HAFBE_CACHE}" + sudo mkdir -p "${LOCAL_HAFBE_CACHE}" sudo cp -a "${DATADIR}" "${LOCAL_HAFBE_CACHE}" ls -lah "${DATADIR}" -- GitLab From 44606b9d7be77618bc77bca3dc90ee292e98ac70 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 14:20:23 -0500 Subject: [PATCH 104/108] Update haf-app-test-runner to v2.1 (adds psycopg2-binary) --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f76a5fe1..cf7d68b6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -463,7 +463,7 @@ sync: extends: .docker_image_builder_job_template stage: sync # Shared HAF app test runner from common-ci-configuration - image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:2.0 + image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:2.1 needs: - job: quick_test_setup artifacts: true @@ -650,7 +650,7 @@ python_api_client_test: stage: test # Shared HAF app test runner from common-ci-configuration # Has: Python 3.14, Poetry, Docker CLI, docker-compose, JMeter, m2u - image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:2.0 + image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:2.1 needs: - job: quick_test_setup artifacts: true @@ -798,7 +798,7 @@ pattern-test: - .hafbe_test_base - .pytest_based_template # Explicit image needed to override .pytest_based_template (which sets docker:latest) - image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:2.0 + image: registry.gitlab.syncad.com/hive/common-ci-configuration/haf-app-test-runner:2.1 variables: JUNIT_REPORT: $CI_PROJECT_DIR/tests/tavern/report.xml POETRY_INSTALL_ROOT_DIR: $CI_PROJECT_DIR/submodules/haf/hive/tests/python/hive-local-tools -- GitLab From 115bff226b21f4fe4024c4783a5e8d7ff9cc96cb Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 17:42:36 -0500 Subject: [PATCH 105/108] Add debug curl to capture get_witness_votes_history error response --- .gitlab-ci.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cf7d68b6..139c95f5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -811,6 +811,19 @@ pattern-test: - !reference [.pytest_based_template, before_script] script: - | + echo -e "\e[0Ksection_start:$(date +%s):debug\r\e[0KDebug: Testing failing endpoint..." 
+ + # Debug the failing get_witness_votes_history endpoint + echo "=== Debug: get_witness_votes_history with wefdi filter ===" + curl -s -X POST "http://${POSTGREST_HOST}:3000/rpc/get_witness_votes_history" \ + -H "Content-Type: application/json" \ + -d '{"account-name":"blocktrades","voter-name":"wefdi","direction":"desc","page-size":20}' \ + | tee /tmp/debug-witness-votes-history.json || true + echo "" + echo "=== Response above ===" + + echo -e "\e[0Ksection_end:$(date +%s):debug\r\e[0K" + echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning Tavern API tests..." cd $CI_PROJECT_DIR/tests/tavern -- GitLab From 4817369519bc47c151001b6debf55419b17700bd Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 18:03:10 -0500 Subject: [PATCH 106/108] Add database warmup step to prevent statement timeout in pattern tests --- .gitlab-ci.yml | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 139c95f5..2a9bfc2c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -811,18 +811,15 @@ pattern-test: - !reference [.pytest_based_template, before_script] script: - | - echo -e "\e[0Ksection_start:$(date +%s):debug\r\e[0KDebug: Testing failing endpoint..." + echo -e "\e[0Ksection_start:$(date +%s):warmup[collapsed=true]\r\e[0KWarming up database..." - # Debug the failing get_witness_votes_history endpoint - echo "=== Debug: get_witness_votes_history with wefdi filter ===" - curl -s -X POST "http://${POSTGREST_HOST}:3000/rpc/get_witness_votes_history" \ + # Warmup: Run a complex query to warm PostgreSQL caches + # This helps avoid statement timeouts on cold database (hafbe_user has 10s timeout) + curl -s --max-time 15 -X POST "http://${POSTGREST_HOST}:3000/rpc/get_witness_votes_history" \ -H "Content-Type: application/json" \ - -d '{"account-name":"blocktrades","voter-name":"wefdi","direction":"desc","page-size":20}' \ - | tee /tmp/debug-witness-votes-history.json || true - echo "" - echo "=== Response above ===" + -d '{"account-name":"blocktrades","page-size":5}' > /dev/null 2>&1 || true - echo -e "\e[0Ksection_end:$(date +%s):debug\r\e[0K" + echo -e "\e[0Ksection_end:$(date +%s):warmup\r\e[0K" echo -e "\e[0Ksection_start:$(date +%s):tests\r\e[0KRunning Tavern API tests..." -- GitLab From 5d6885a206aa7d023aec47f13f571aa1cbaa29bf Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Wed, 31 Dec 2025 19:19:27 -0500 Subject: [PATCH 107/108] Fix sync job: set HAF_REGISTRY_TAG from HAF_COMMIT When prepare_haf_image is skipped (e.g., docs-only changes in merge commit), the sync job needs HAF_REGISTRY_TAG to be explicitly set, otherwise it falls back to an outdated default value in docker-compose.yml. 
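Compose substitutes ${VAR:-default} when the configuration is rendered, so the stale fallback is easy to spot; an illustrative check (the compose file set matches COMPOSE_OPTIONS_STRING, the grep pattern is an assumption):

    # With the variable unset, the compose file's default tag wins:
    unset HAF_REGISTRY_TAG
    docker-compose --env-file ci.env -f docker-compose.yml -f overrides/ci.yml config | grep 'haf/instance'
    # Pinning it to the HAF commit makes the resolved image explicit:
    export HAF_REGISTRY_TAG="$HAF_COMMIT"
    docker-compose --env-file ci.env -f docker-compose.yml -f overrides/ci.yml config | grep 'haf/instance'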
--- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2a9bfc2c..fcef3188 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -488,6 +488,8 @@ sync: HAF_DATA_DIRECTORY: ${DATADIR} HAF_SHM_DIRECTORY: ${SHM_DIR} BACKEND_VERSION: "$CI_COMMIT_SHORT_SHA" + # HAF image tag - use HAF_COMMIT to ensure correct image when prepare_haf_image is skipped + HAF_REGISTRY_TAG: "$HAF_COMMIT" POSTGRES_ACCESS: postgresql://haf_admin@docker:5432/haf_block_log COMPOSE_OPTIONS_STRING: --env-file ci.env --file docker-compose.yml --file overrides/ci.yml --ansi never -- GitLab From 8344a5317824327067bd4a0e100eb60611e90095 Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Thu, 1 Jan 2026 16:21:07 -0500 Subject: [PATCH 108/108] Add missing index for voter filtering in witness votes history The get_witness_votes_history endpoint was timing out when filtering by voter-name because there was no index on voter_id. The existing index only covered (witness_id, source_op_block). This adds an index on (witness_id, voter_id) to enable efficient lookups when the voter-name filter is used. --- database/indexes/hafbe_app_indexes.sql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/database/indexes/hafbe_app_indexes.sql b/database/indexes/hafbe_app_indexes.sql index 6fa5ab09..94ee482b 100644 --- a/database/indexes/hafbe_app_indexes.sql +++ b/database/indexes/hafbe_app_indexes.sql @@ -15,6 +15,9 @@ BEGIN --Can only vote once every 3 seconds, so sorting by block_num is sufficient CREATE INDEX IF NOT EXISTS witness_votes_history_witness_id_source_op ON hafbe_app.witness_votes_history USING btree (witness_id, hafd.operation_id_to_block_num( source_op )); + -- Index for efficient voter filtering in get_witness_votes_history endpoint + CREATE INDEX IF NOT EXISTS witness_votes_history_witness_voter ON hafbe_app.witness_votes_history USING btree (witness_id, voter_id); + CREATE INDEX IF NOT EXISTS account_proxies_history_account_id_source_op ON hafbe_app.account_proxies_history USING btree (account_id, source_op); CREATE INDEX IF NOT EXISTS account_proxies_history_account_id ON hafbe_app.account_proxies_history USING btree (account_id); CREATE INDEX IF NOT EXISTS current_account_proxies_proxy_id ON hafbe_app.current_account_proxies USING btree (proxy_id); -- GitLab
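To confirm the planner actually picks up the new index on a synced database, an EXPLAIN is enough; a sketch (the connection string matches the CI POSTGRES_ACCESS setting, the ID values are placeholders):

    psql "postgresql://haf_admin@docker:5432/haf_block_log" -c \
      "EXPLAIN ANALYZE
       SELECT * FROM hafbe_app.witness_votes_history
       WHERE witness_id = 123 AND voter_id = 456;"
    # Expect an Index Scan using witness_votes_history_witness_voter rather
    # than a scan over witness_votes_history_witness_id_source_op plus a filter.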