From 9f1d9482ed8c64dacb5e13e48c0fc53c13f85075 Mon Sep 17 00:00:00 2001
From: Dan Notestein
Date: Sun, 4 Jan 2026 12:26:04 -0500
Subject: [PATCH 1/3] Drop haf submodule, use existing HAF images from registry

- Remove haf submodule dependency entirely
- Use pre-built HAF Docker images from registry.gitlab.syncad.com/hive/haf
- Derive image tag from first 8 chars of HAF_COMMIT
- Add shm_dir relocation for cache format compatibility
- Fix HAF registry path (remove /instance suffix)
- Add COMMON_CI_REF for fetching common CI scripts
---
 .gitlab-ci.yml                                | 71 ++++++++++---------
 .gitmodules                                   |  3 -
 haf                                           |  1 -
 .../ci-helpers/start-ci-test-environment.sh   |  4 +-
 4 files changed, 41 insertions(+), 38 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 160000 haf

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1732346..a1ddee2 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -18,10 +18,8 @@ variables:
   # Full clone (depth 0) enables efficient incremental fetches - shallow clones
   # don't reduce server CPU and make fetch less effective.
   GIT_STRATEGY: fetch
-  GIT_SUBMODULE_STRATEGY: normal
+  GIT_SUBMODULE_STRATEGY: none # HAF submodule no longer needed
   GIT_DEPTH: 0
-  GIT_SUBMODULE_DEPTH: 0
-  GIT_SUBMODULE_UPDATE_FLAGS: --jobs 4
   # Temporary: separate clone path prevents clone-strategy jobs from erasing
   # fetch workspaces during transition. Remove once all projects use fetch.
   GIT_CLONE_PATH: $CI_BUILDS_DIR/fetch/$CI_RUNNER_SHORT_TOKEN/$CI_CONCURRENT_ID/$CI_PROJECT_PATH
@@ -47,10 +45,13 @@ variables:
   PYTHON_IMAGE: "registry.gitlab.syncad.com/hive/hive/ci-base-image${PYTHON_IMAGE_TAG}"
   # Python 3.14 compatibility - allow PyO3-based packages to build
   PYO3_USE_ABI3_FORWARD_COMPATIBILITY: "1"
-  # HAF submodule commit - used for data caching and service containers
-  HAF_COMMIT: "70c30036c6a5d0cb882d05aa22f42523a8f34970"
+  # HAF configuration (no submodule needed)
+  # HAF_COMMIT: Used for data cache key lookup and Docker image tag (first 8 chars)
+  HAF_COMMIT: "219ef44775ee1ed85630cf513d8671f6ca4ccc49" # Known cached HAF commit
   # Enable CI-specific PostgreSQL config with reduced memory for HAF service containers
   HAF_CI_MODE: "1"
+  # Common CI configuration reference
+  COMMON_CI_REF: "develop"
   # =============================================================================
   # QUICK TEST MODE
   # =============================================================================
@@ -171,28 +172,33 @@ lint_sql_scripts:
 validate_haf_commit:
   extends: .haf_commit_validation
   stage: build
-  variables:
-    # Uses HAF_COMMIT from global variables - no need to duplicate here
-    HAF_SUBMODULE_PATH: "haf"
+  # Uses HAF_COMMIT from global variables
   tags:
     - public-runner-docker
 
+# Use existing HAF image from registry (no submodule needed)
 prepare_haf_image:
   stage: build
-  extends: .prepare_haf_image
-  variables:
-    SUBMODULE_DIR: "$CI_PROJECT_DIR/haf"
-    REGISTRY_USER: "$HAF_DEPLOY_USERNAME"
-    REGISTRY_PASS: "$HAF_DEPLOY_TOKEN"
-    GIT_SUBMODULE_STRATEGY: recursive
-    # Use fresh clone to avoid stale submodule state from previous jobs
-    GIT_STRATEGY: clone
-  before_script:
-    - git config --global --add safe.directory $CI_PROJECT_DIR/haf
-    - git config --global --add safe.directory $CI_PROJECT_DIR/haf/hive
+  image: alpine:latest
+  needs:
+    - job: validate_haf_commit
+      artifacts: true
+  script:
+    - |
+      # Derive image tag from HAF_COMMIT (first 8 characters)
+      HAF_SHORT_COMMIT="${HAF_COMMIT:0:8}"
+      HAF_IMAGE_NAME="registry.gitlab.syncad.com/hive/haf:${HAF_SHORT_COMMIT}"
+      echo "Using HAF image: ${HAF_IMAGE_NAME}"
+      # Export image name and tag for Docker Compose scripts
+      echo "HAF_IMAGE_NAME=${HAF_IMAGE_NAME}" > docker_image_name.env
+      echo "HAF_REGISTRY_TAG=${HAF_SHORT_COMMIT}" >> docker_image_name.env
+  artifacts:
+    reports:
+      dotenv: docker_image_name.env
+    paths:
+      - docker_image_name.env
   tags:
     - public-runner-docker
-    - bigjob
 
 extract-swagger-json:
   extends: .filter_out_swagger_json
@@ -256,13 +262,12 @@ prepare_haf_data:
     - job: detect_changes
       artifacts: true
     - job: prepare_haf_image
-      optional: true # Can be skipped when AUTO_SKIP_BUILD=true
+      artifacts: true
   stage: build
   timeout: 80m
   variables:
-    SUBMODULE_DIR: "$CI_PROJECT_DIR/haf"
     BLOCK_LOG_SOURCE_DIR: $BLOCK_LOG_SOURCE_DIR_5M
-    CONFIG_INI_SOURCE: "$CI_PROJECT_DIR/haf/docker/config_5M.ini"
+    # Config fetched from hive repo by template if not specified locally
   # Note: The .prepare_haf_data_5m template already has caching logic
   # that skips replay if data exists. No additional skip logic needed.
   tags:
@@ -343,16 +348,12 @@ sync:
     - job: detect_changes
       artifacts: true
     - job: prepare_haf_image
-      optional: true # Skip when only docs/tests changed
+      artifacts: true
     - job: prepare_haf_data
-      optional: true # Skip when only docs/tests changed
+      artifacts: true
     - docker-setup-docker-image-build
     - docker-ci-runner-build
   variables:
-    # Use clone strategy to avoid stale submodule state from fetch strategy
-    GIT_STRATEGY: clone
-    # Need recursive for haf/hive submodule (copy_datadir.sh symlink)
-    GIT_SUBMODULE_STRATEGY: recursive
     DATA_SOURCE: ${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}
     BACKEND_VERSION: "$CI_COMMIT_SHORT_SHA"
     POSTGRES_ACCESS: postgresql://haf_admin@docker:5432/haf_block_log
@@ -366,6 +367,13 @@ sync:
     - !reference [.haf_app_sync_setup, script]
     - !reference [.haf_app_fetch_haf_cache, script]
   script:
+    # Relocate shm_dir from inside datadir to be a sibling (for old cache formats)
+    - |
+      if [[ -d "${DATADIR}/shm_dir" ]] && [[ ! -d "${SHM_DIR}" ]]; then
+        echo "Relocating shm_dir to parallel location..."
+        mv "${DATADIR}/shm_dir" "${SHM_DIR}"
+      fi
+      ls -la "${SHM_DIR}/" || true
     # Copy blockchain and datadir
     - !reference [.haf_app_copy_blockchain, script]
     - !reference [.haf_app_copy_datadir, script]
@@ -525,8 +533,7 @@ pattern-test:
     - |
       echo -e "\e[0Ksection_start:$(date +%s):tests_api[collapsed=true]\r\e[0KCloning tests_api repository..."
       cd "${CI_PROJECT_DIR}"
-      TESTS_API_DIR="${CI_PROJECT_DIR}/haf/hive/tests/python/hive-local-tools/tests_api"
-      mkdir -p "$(dirname "$TESTS_API_DIR")"
+      TESTS_API_DIR="${CI_PROJECT_DIR}/tests_api"
       rm -rf "$TESTS_API_DIR"
       git clone --depth 1 https://gitlab.syncad.com/hive/tests_api.git "$TESTS_API_DIR"
       echo -e "\e[0Ksection_end:$(date +%s):tests_api\r\e[0K"
@@ -534,7 +541,7 @@ pattern-test:
     - |
       # Install tests_api for validate_response module (required by tavern tests)
       . venv/bin/activate
-      pip install -e "${CI_PROJECT_DIR}/haf/hive/tests/python/hive-local-tools/tests_api"
+      pip install -e "${CI_PROJECT_DIR}/tests_api"
   script:
     - |
       cd "${CI_PROJECT_DIR}"
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 3f9daf5..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "haf"]
-	path = haf
-	url = ../haf.git
diff --git a/haf b/haf
deleted file mode 160000
index 70c3003..0000000
--- a/haf
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 70c30036c6a5d0cb882d05aa22f42523a8f34970
diff --git a/scripts/ci-helpers/start-ci-test-environment.sh b/scripts/ci-helpers/start-ci-test-environment.sh
index 451b8ef..b7ee26e 100755
--- a/scripts/ci-helpers/start-ci-test-environment.sh
+++ b/scripts/ci-helpers/start-ci-test-environment.sh
@@ -12,7 +12,7 @@ OPTIONS:
   --backend-version=VERSION    HAF BE version (default: latest)
   --haf-data-directory=PATH    HAF Data directory path (default: /srv/haf/data)
   --haf-shm-directory=PATH     HAF SHM directory path (default: /srv/haf/shm)
-  --haf-registry=REGISTRY      HAF registry to use (default: registry.gitlab.syncad.com/hive/haf/instance)
+  --haf-registry=REGISTRY      HAF registry to use (default: registry.gitlab.syncad.com/hive/haf)
   --haf-version=VERSION        HAF version to use (default: 9ec94375)
   --hived-uid=UID              UID that hived daemon should be running as (default: $(id -u))
   --help|-h|-?                 Display this help screen and exit
@@ -66,7 +66,7 @@ cat <<-EOF | tee ci.env
 	BACKEND_VERSION=${BACKEND_VERSION:-latest}
 	HAF_DATA_DIRECTORY=${HAF_DATA_DIRECTORY:-/srv/haf/data}
 	HAF_SHM_DIRECTORY=${HAF_SHM_DIRECTORY:-/srv/haf/shm}
-	HAF_REGISTRY=${HAF_REGISTRY_PATH:-registry.gitlab.syncad.com/hive/haf/instance}
+	HAF_REGISTRY=${HAF_REGISTRY_PATH:-registry.gitlab.syncad.com/hive/haf}
 	HAF_VERSION=${HAF_REGISTRY_TAG:-9ec94375}
 	HIVED_UID=${HIVED_UID:-$(id -u)}
 	PGHERO_USERNAME=link
--
GitLab
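
A note on patch 1: the image tag derivation is plain bash substring expansion, so it can be sanity-checked locally before updating HAF_COMMIT. A minimal sketch, assuming the docker CLI and pull access to registry.gitlab.syncad.com (the helper script name is hypothetical):

    #!/usr/bin/env bash
    # check-haf-image.sh (hypothetical helper): verify that a HAF image exists
    # in the registry for a given HAF_COMMIT before committing it to .gitlab-ci.yml.
    set -euo pipefail

    HAF_COMMIT="${1:?usage: check-haf-image.sh <40-char-haf-commit>}"
    # Same derivation as the prepare_haf_image job: first 8 chars of the commit
    HAF_SHORT_COMMIT="${HAF_COMMIT:0:8}"
    HAF_IMAGE_NAME="registry.gitlab.syncad.com/hive/haf:${HAF_SHORT_COMMIT}"

    # 'docker manifest inspect' queries the registry without pulling any layers
    if docker manifest inspect "${HAF_IMAGE_NAME}" >/dev/null 2>&1; then
      echo "OK: ${HAF_IMAGE_NAME} exists"
    else
      echo "MISSING: ${HAF_IMAGE_NAME} - pick a HAF commit with a published image" >&2
      exit 1
    fi
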
From 3c8b2212e3dec92ecd42d19016ebf0c9fe2648e8 Mon Sep 17 00:00:00 2001
From: Dan Notestein
Date: Sun, 4 Jan 2026 12:30:38 -0500
Subject: [PATCH 2/3] Fix sync job: use smart cache lookup instead of submodule scripts

- Replace .haf_app_fetch_haf_cache with .haf_app_smart_cache_lookup
  (extracts directly to job dir with permissions fixed)
- Replace .haf_app_copy_blockchain/.haf_app_copy_datadir with inline scripts
  (these templates referenced haf/scripts/copy_datadir.sh from the submodule)
- Add cache hit detection to skip the wait when the app is already synced
- Use .haf_app_sync_save_cache_conditional for smarter cache saves
- Increase timeout to 2 hours for NFS cache push
---
 .gitlab-ci.yml | 48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a1ddee2..28c4a79 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -354,7 +354,6 @@ sync:
     - docker-setup-docker-image-build
     - docker-ci-runner-build
   variables:
-    DATA_SOURCE: ${DATA_CACHE_HAF_PREFIX}_${HAF_COMMIT}
     BACKEND_VERSION: "$CI_COMMIT_SHORT_SHA"
     POSTGRES_ACCESS: postgresql://haf_admin@docker:5432/haf_block_log
     COMPOSE_OPTIONS_STRING: --env-file ci.env --file docker-compose.yml --file overrides/dev.yml
@@ -362,33 +361,52 @@ sync:
     # Aliases for sync templates
     APP_SYNC_CACHE_TYPE: "${REPTRACKER_SYNC_CACHE_TYPE}"
     APP_CACHE_KEY: "${REPTRACKER_CACHE_KEY}"
-  timeout: 1 hours
+  timeout: 2 hours # NFS cache push can take 50+ minutes on slow days
   before_script:
     - !reference [.haf_app_sync_setup, script]
-    - !reference [.haf_app_fetch_haf_cache, script]
+    - !reference [.haf_app_smart_cache_lookup, script]
   script:
-    # Relocate shm_dir from inside datadir to be a sibling (for old cache formats)
     - |
+      CACHE_HIT=$(cat /tmp/cache_hit 2>/dev/null || echo "false")
+
+      echo -e "\e[0Ksection_start:$(date +%s):compose[collapsed=true]\r\e[0KStarting the test environment..."
+
+      # cache-manager extracted directly to job directory with block_log symlinks and pgdata permissions fixed
+      echo "Data ready at DATADIR=${DATADIR}"
+      ls -la "${DATADIR}/" || true
+
+      # Relocate shm_dir from inside datadir to be a sibling (for old cache formats)
       if [[ -d "${DATADIR}/shm_dir" ]] && [[ ! -d "${SHM_DIR}" ]]; then
         echo "Relocating shm_dir to parallel location..."
         mv "${DATADIR}/shm_dir" "${SHM_DIR}"
       fi
       ls -la "${SHM_DIR}/" || true
-    # Copy blockchain and datadir
-    - !reference [.haf_app_copy_blockchain, script]
-    - !reference [.haf_app_copy_datadir, script]
-    # Start app-specific environment
-    - |
-      echo -e "\e[0Ksection_start:$(date +%s):start_env[collapsed=true]\r\e[0KStarting the test environment..."
+
+      # Docker Compose bind mounts docker/blockchain over datadir/blockchain.
+      # Copy blockchain files there so HAF can see them.
+      # Use cp -aL to dereference symlinks (copy actual files, not symlink references)
+      echo "Copying blockchain files to docker/blockchain..."
+      rm -rf "${CI_PROJECT_DIR}/docker/blockchain"/*
+      cp -aL "${DATADIR}/blockchain"/* "${CI_PROJECT_DIR}/docker/blockchain/"
+      ls -la "${CI_PROJECT_DIR}/docker/blockchain/"
+
+      # Remove blockchain dir from datadir so Docker volume mount doesn't conflict
+      sudo rm -rf "${DATADIR}/blockchain"
+
       "${CI_PROJECT_DIR}/scripts/ci-helpers/start-ci-test-environment.sh"
-      echo -e "\e[0Ksection_end:$(date +%s):start_env\r\e[0K"
+
+      echo -e "\e[0Ksection_end:$(date +%s):compose\r\e[0K"
     - |
-      echo -e "\e[0Ksection_start:$(date +%s):wait[collapsed=true]\r\e[0KWaiting for reputation tracker..."
-      "${CI_PROJECT_DIR}/scripts/ci-helpers/wait-for-rt-startup.sh"
-      echo -e "\e[0Ksection_end:$(date +%s):wait\r\e[0K"
+      if [[ "$CACHE_HIT" == "true" ]]; then
+        echo "Cache hit - reputation_tracker already synced, skipping wait"
+      else
+        echo -e "\e[0Ksection_start:$(date +%s):wait[collapsed=true]\r\e[0KWaiting for reputation tracker..."
+        "${CI_PROJECT_DIR}/scripts/ci-helpers/wait-for-rt-startup.sh"
+        echo -e "\e[0Ksection_end:$(date +%s):wait\r\e[0K"
+      fi
     # Shutdown and save cache using templates
     - !reference [.haf_app_sync_shutdown, script]
-    - !reference [.haf_app_sync_save_cache, script]
+    - !reference [.haf_app_sync_save_cache_conditional, script]
   after_script: !reference [.haf_app_sync_cleanup, after_script]
   artifacts: !reference [.haf_app_sync_artifacts, artifacts]
   tags:
--
GitLab
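
A note on patch 2: the CACHE_HIT flag read at the top of the script: phase is produced by .haf_app_smart_cache_lookup in common-ci-configuration. A minimal sketch of the handshake it implies — illustrative only, not the template's actual implementation — assuming the ${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}.tar naming that patch 3 below relies on:

    #!/usr/bin/env bash
    # Illustrative lookup step: extract the app cache if one exists and record
    # the result in /tmp/cache_hit, which the sync job's script: phase reads.
    set -euo pipefail

    CACHE_KEY="${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}"
    CACHE_TAR="${DATA_CACHE_NFS_PREFIX}/${APP_SYNC_CACHE_TYPE}/${CACHE_KEY}.tar"
    JOB_DIR="${CI_PROJECT_DIR}/${CI_JOB_ID}"
    mkdir -p "${JOB_DIR}"

    if [[ -f "${CACHE_TAR}" ]]; then
      tar -xf "${CACHE_TAR}" -C "${JOB_DIR}"
      echo "true" > /tmp/cache_hit   # app already synced: sync job skips the wait
    else
      echo "false" > /tmp/cache_hit  # full sync needed; cache is saved afterwards
    fi
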
From 3c1e12bc40b9f0074f52b78dd4cc5128e13ad352 Mon Sep 17 00:00:00 2001
From: Dan Notestein
Date: Sun, 4 Jan 2026 12:40:15 -0500
Subject: [PATCH 3/3] Fix test jobs: use fallback cache extraction logic

The common-ci-configuration template expects an exact cache key match,
but when the sync job finds an existing cache (same HAF commit, different
app commit), it reuses that cache without saving a new one. Test jobs
then look for the exact key, which doesn't exist.

This fix adds fallback logic: try the exact key first, then search for
any compatible cache with the same HAF commit.
---
 .gitlab-ci.yml | 86 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 82 insertions(+), 4 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 28c4a79..68e715e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -414,16 +414,94 @@ sync:
     - fast
 
 # =============================================================================
-# DinD Test Template (Phase 4 migration)
+# DinD Test Template
 # =============================================================================
-# Uses composable templates from common-ci-configuration instead of inline scripts.
-# This reduces duplication and centralizes maintenance.
+# Custom template with fallback cache extraction logic.
+# The common-ci-configuration template expects exact cache key match, but when
+# sync job finds an existing cache (same HAF commit, different app commit),
+# we need fallback logic to find any compatible cache.
 .test-with-docker-compose:
   extends:
-    - .haf_app_dind_complete_test
+    - .docker_image_builder_job_template
+    - .haf_app_dind_test_variables
+  stage: test
   image: registry.gitlab.syncad.com/hive/reputation_tracker/ci-runner:docker-24.0.1-8
   variables:
     COMPOSE_OPTIONS_STRING: "--file docker-compose-test.yml --ansi never"
+  timeout: 30 minutes
+  before_script:
+    - !reference [.docker_image_builder_job_template, before_script]
+    - |
+      echo -e "\e[0Ksection_start:$(date +%s):login[collapsed=true]\r\e[0KLogging to Docker registry..."
+      docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY
+      echo -e "\e[0Ksection_end:$(date +%s):login\r\e[0K"
+    # Cache extraction with fallback logic
+    - |
+      echo -e "\e[0Ksection_start:$(date +%s):extract[collapsed=true]\r\e[0KExtracting cache for DinD test..."
+      JOB_DIR="${CI_PROJECT_DIR}/${CI_JOB_ID}"
+
+      # Fetch cache-manager
+      if [[ ! -x "$CACHE_MANAGER" ]]; then
+        mkdir -p "$(dirname "$CACHE_MANAGER")"
+        curl -fsSL "https://gitlab.syncad.com/hive/common-ci-configuration/-/raw/${CACHE_MANAGER_REF:-develop}/scripts/cache-manager.sh" -o "$CACHE_MANAGER"
+        chmod +x "$CACHE_MANAGER"
+      fi
+
+      mkdir -p "${JOB_DIR}"
+
+      # Try exact cache key first
+      EXACT_KEY="${HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}"
+      echo "Trying exact cache key: ${APP_SYNC_CACHE_TYPE}/${EXACT_KEY}"
+      if CACHE_HANDLING=haf "$CACHE_MANAGER" get "${APP_SYNC_CACHE_TYPE}" "${EXACT_KEY}" "${JOB_DIR}" 2>/dev/null; then
+        echo "Found exact cache match"
+      else
+        # Fallback: search for any cache with same HAF commit
+        echo "Exact key not found, searching for compatible cache with HAF commit ${HAF_COMMIT}..."
+        NFS_CACHE_DIR="${DATA_CACHE_NFS_PREFIX}/${APP_SYNC_CACHE_TYPE}"
+        FOUND_CACHE=$(ls -t "${NFS_CACHE_DIR}/${HAF_COMMIT}_"*.tar 2>/dev/null | head -1 || true)
+        if [[ -n "$FOUND_CACHE" ]]; then
+          FOUND_KEY=$(basename "$FOUND_CACHE" .tar)
+          echo "Found compatible cache: ${FOUND_KEY}"
+          if ! CACHE_HANDLING=haf "$CACHE_MANAGER" get "${APP_SYNC_CACHE_TYPE}" "${FOUND_KEY}" "${JOB_DIR}"; then
+            echo "ERROR: Failed to extract compatible cache"
+            exit 1
+          fi
+        else
+          echo "ERROR: No compatible cache found in ${NFS_CACHE_DIR}/${HAF_COMMIT}_*.tar"
+          exit 1
+        fi
+      fi
+
+      # Relocate shm_dir from inside datadir to be a sibling (for old cache formats)
+      if [[ -d "${HAF_DATA_DIRECTORY}/shm_dir" ]] && [[ ! -d "${HAF_SHM_DIRECTORY}" ]]; then
+        echo "Relocating shm_dir to parallel location..."
+ mv "${HAF_DATA_DIRECTORY}/shm_dir" "${HAF_SHM_DIRECTORY}" + fi + + # Handle blockchain - copy to docker directory + mkdir -p "${CI_PROJECT_DIR}/docker/blockchain" + if [[ -d "${HAF_DATA_DIRECTORY}/blockchain" ]] && [[ -n "$(ls -A "${HAF_DATA_DIRECTORY}/blockchain" 2>/dev/null)" ]]; then + echo "Copying blockchain from cache to docker directory..." + cp -aL "${HAF_DATA_DIRECTORY}/blockchain"/* "${CI_PROJECT_DIR}/docker/blockchain/" + elif [[ -d "${BLOCK_LOG_SOURCE_DIR_5M}" ]]; then + echo "Symlinking blockchain from source..." + ln -sfn "${BLOCK_LOG_SOURCE_DIR_5M}/block_log" "${CI_PROJECT_DIR}/docker/blockchain/block_log" + ln -sfn "${BLOCK_LOG_SOURCE_DIR_5M}/block_log.artifacts" "${CI_PROJECT_DIR}/docker/blockchain/block_log.artifacts" + fi + ls -la "${CI_PROJECT_DIR}/docker/blockchain/" + + echo -e "\e[0Ksection_end:$(date +%s):extract\r\e[0K" + - !reference [.haf_app_dind_compose_startup, script] + - !reference [.haf_app_dind_wait_for_services, script] + after_script: !reference [.haf_app_dind_compose_teardown, after_script] + artifacts: + when: always + paths: + - docker/container-logs.tar.gz + expire_in: 1 week + tags: + - data-cache-storage + - fast python_api_client_test: extends: .configuration_template -- GitLab