diff --git a/scripts/cache-manager.sh b/scripts/cache-manager.sh
index d4a5003cf1a23885cdd8b9a646de4f4d983c9778..b31afe7dd35054806b271d49410710686cf942e4 100755
--- a/scripts/cache-manager.sh
+++ b/scripts/cache-manager.sh
@@ -20,12 +20,14 @@
 # Cache types: hive, haf, balance_tracker, hivemind, etc.
 #
 # Environment variables:
-#   CACHE_NFS_PATH     - NFS mount point (default: /nfs/ci-cache)
-#   CACHE_LOCAL_PATH   - Local cache directory (default: /cache)
-#   CACHE_MAX_SIZE_GB  - Max total NFS cache size (default: 2000)
-#   CACHE_MAX_AGE_DAYS - Max cache age (default: 30)
-#   CACHE_LOCK_TIMEOUT - Lock timeout in seconds (default: 3600)
-#   CACHE_QUIET        - Suppress verbose output (default: false)
+#   CACHE_NFS_PATH         - NFS mount point (default: /nfs/ci-cache)
+#   CACHE_LOCAL_PATH       - Local cache directory (default: /cache)
+#   CACHE_MAX_SIZE_GB      - Max total NFS cache size (default: 2000)
+#   CACHE_MAX_AGE_DAYS     - Max cache age (default: 30)
+#   CACHE_LOCK_TIMEOUT     - Lock timeout in seconds (default: 120)
+#   CACHE_QUIET            - Suppress verbose output (default: false)
+#   SHARED_BLOCK_LOG_LOCAL - Local block_log path (default: /blockchain/block_log_5m)
+#   SHARED_BLOCK_LOG_NFS   - NFS block_log path (default: /nfs/ci-cache/hive/blockchain/block_log_5m)
 
 set -euo pipefail
 
@@ -62,6 +64,10 @@ CACHE_LOCK_TIMEOUT="${CACHE_LOCK_TIMEOUT:-120}" # 2 minutes (NFS writes take ~1
 CACHE_STALE_LOCK_MINUTES="${CACHE_STALE_LOCK_MINUTES:-10}" # Break locks older than this (writes take ~10s)
 CACHE_QUIET="${CACHE_QUIET:-false}"
 
+# Shared block_log locations (used when blockchain excluded from cache)
+SHARED_BLOCK_LOG_LOCAL="${SHARED_BLOCK_LOG_LOCAL:-/blockchain/block_log_5m}"
+SHARED_BLOCK_LOG_NFS="${SHARED_BLOCK_LOG_NFS:-/nfs/ci-cache/hive/blockchain/block_log_5m}"
+
 # Logging
 _log() {
     if [[ "$CACHE_QUIET" != "true" ]]; then
@@ -403,18 +409,17 @@ _link_shared_block_log() {
     fi
 
     # Shared block_log locations - check local first (faster), then NFS
-    local LOCAL_BLOCK_LOG="/blockchain/block_log_5m"
-    local NFS_BLOCK_LOG="/nfs/ci-cache/hive/block_log_5m"
+    # Paths are configurable via SHARED_BLOCK_LOG_LOCAL and SHARED_BLOCK_LOG_NFS env vars
 
     local shared_block_log=""
-    if [[ -d "$LOCAL_BLOCK_LOG" ]] && [[ -n "$(ls -A "$LOCAL_BLOCK_LOG" 2>/dev/null)" ]]; then
-        shared_block_log="$LOCAL_BLOCK_LOG"
-        _log "Using local shared block_log: $LOCAL_BLOCK_LOG"
-    elif [[ -d "$NFS_BLOCK_LOG" ]] && [[ -n "$(ls -A "$NFS_BLOCK_LOG" 2>/dev/null)" ]]; then
-        shared_block_log="$NFS_BLOCK_LOG"
-        _log "Using NFS shared block_log: $NFS_BLOCK_LOG"
+    if [[ -d "$SHARED_BLOCK_LOG_LOCAL" ]] && [[ -n "$(ls -A "$SHARED_BLOCK_LOG_LOCAL" 2>/dev/null)" ]]; then
+        shared_block_log="$SHARED_BLOCK_LOG_LOCAL"
+        _log "Using local shared block_log: $SHARED_BLOCK_LOG_LOCAL"
+    elif [[ -d "$SHARED_BLOCK_LOG_NFS" ]] && [[ -n "$(ls -A "$SHARED_BLOCK_LOG_NFS" 2>/dev/null)" ]]; then
+        shared_block_log="$SHARED_BLOCK_LOG_NFS"
+        _log "Using NFS shared block_log: $SHARED_BLOCK_LOG_NFS"
     else
-        _log "WARNING: No shared block_log found at $LOCAL_BLOCK_LOG or $NFS_BLOCK_LOG"
+        _log "WARNING: No shared block_log found at $SHARED_BLOCK_LOG_LOCAL or $SHARED_BLOCK_LOG_NFS"
         return 0
     fi
 
@@ -478,13 +483,10 @@ cmd_get() {
     local is_nfs_host=false
     _is_nfs_host && is_nfs_host=true
 
-    # Determine which tar file to use (local cache or NFS)
-    local source_tar=""
-
-    # 1. Check local tar cache first (on NFS host, this IS the NFS tar)
+    # 1. Ensure we have a local tar file (copy from NFS if needed)
+    # Always extract from local for faster I/O
     if [[ -f "$LOCAL_TAR_FILE" ]]; then
         _log "Local cache hit: $LOCAL_TAR_FILE"
-        source_tar="$LOCAL_TAR_FILE"
     elif [[ "$is_nfs_host" == "true" ]]; then
         # On NFS host, local and NFS are the same - if local miss, it's a miss
         _log "NFS host cache miss: $NFS_TAR_FILE"
@@ -493,20 +495,32 @@ cmd_get() {
         _log "NFS not available, cache miss"
         return 1
     elif [[ -f "$NFS_TAR_FILE" ]]; then
-        _log "NFS cache hit: $NFS_TAR_FILE"
-        source_tar="$NFS_TAR_FILE"
+        # Copy NFS tar to local FIRST, then extract from local (faster)
+        _log "NFS cache hit: $NFS_TAR_FILE - copying to local cache"
+        mkdir -p "$(dirname "$LOCAL_TAR_FILE")"
+        local copy_start=$(date +%s.%N)
+        if cp "$NFS_TAR_FILE" "$LOCAL_TAR_FILE"; then
+            local copy_end=$(date +%s.%N)
+            local copy_duration=$(echo "$copy_end - $copy_start" | bc)
+            local tar_size=$(stat -c %s "$LOCAL_TAR_FILE" 2>/dev/null || echo 0)
+            local throughput=$(echo "scale=2; $tar_size / 1024 / 1024 / $copy_duration" | bc 2>/dev/null || echo '?')
+            _log "Copied to local cache in ${copy_duration}s (${throughput} MB/s)"
+        else
+            _error "Failed to copy NFS tar to local cache"
+            return 1
+        fi
     else
         _log "Cache miss: $NFS_TAR_FILE"
         return 1
     fi
 
-    # 2. Clean up stale extraction and extract tar to destination
+    # 2. Clean up stale extraction and extract from LOCAL tar
     # Previous runs may have left directories with postgres ownership (UID 105, mode 700)
     # that the current user can't write to - clean those up first
    _cleanup_stale_extraction "$local_dest"
     mkdir -p "$local_dest"
 
-    local tar_lock="${source_tar}.lock"
+    local tar_lock="${LOCAL_TAR_FILE}.lock"
     _touch_lock "$tar_lock"
 
     local get_start_time=$(date +%s.%N)
@@ -514,12 +528,12 @@ cmd_get() {
         lock_acquired=\$(date +%s.%N)
         echo \"[cache-manager] Shared lock acquired in \$(echo \"\$lock_acquired - $get_start_time\" | bc)s\" >&2
 
-        tar_size=\$(stat -c %s '$source_tar' 2>/dev/null || echo 0)
+        tar_size=\$(stat -c %s '$LOCAL_TAR_FILE' 2>/dev/null || echo 0)
         tar_size_gb=\$(echo \"scale=2; \$tar_size / 1024 / 1024 / 1024\" | bc)
         echo \"[cache-manager] Extracting (\${tar_size_gb}GB) to: $local_dest\" >&2
 
         extract_start=\$(date +%s.%N)
-        tar xf '$source_tar' -C '$local_dest'
+        tar xf '$LOCAL_TAR_FILE' -C '$local_dest'
         extract_end=\$(date +%s.%N)
         extract_duration=\$(echo \"\$extract_end - \$extract_start\" | bc)
         throughput=\$(echo \"scale=2; \$tar_size / 1024 / 1024 / \$extract_duration\" | bc 2>/dev/null || echo '?')
@@ -531,14 +545,6 @@ cmd_get() {
         return 1
     fi
 
-    # 3. Copy NFS tar to local cache for future use (skip if already local or on NFS host)
-    if [[ "$source_tar" == "$NFS_TAR_FILE" && "$LOCAL_TAR_FILE" != "$NFS_TAR_FILE" && ! -f "$LOCAL_TAR_FILE" ]]; then
-        mkdir -p "$(dirname "$LOCAL_TAR_FILE")"
-        if cp "$NFS_TAR_FILE" "$LOCAL_TAR_FILE" 2>/dev/null; then
-            _log "Cached locally: $LOCAL_TAR_FILE"
-        fi
-    fi
-
     # Post-extraction fixes for HAF caches
     # Covers: haf, haf_sync, haf_pipeline, haf_filtered, haf_hafbe_sync, etc.
     if [[ "$cache_type" == haf* ]]; then
diff --git a/templates/haf_app_testing.gitlab-ci.yml b/templates/haf_app_testing.gitlab-ci.yml
index d4411c9ea7791bc9dca5da86525c61a0307a4b2e..fe45383025b3f54a4d9fff48206b5ca632209c23 100644
--- a/templates/haf_app_testing.gitlab-ci.yml
+++ b/templates/haf_app_testing.gitlab-ci.yml
@@ -285,8 +285,8 @@ include:
           echo ""
           echo "=== Can skip data prep (only tests/docs changed) ==="
           CAN_SKIP_BUILD=true
-          # Use current HAF submodule commit as cache key
-          CACHE_COMMIT="${HAF_COMMIT}"
+          # Don't set CACHE_COMMIT here - smart_cache_lookup will find an available cache
+          # The app's AUTO_CACHE_HAF_COMMIT variable (if set) will be used
         else
           echo ""
           echo "=== Files requiring full build: ==="
@@ -294,12 +294,17 @@ include:
         fi
       fi
 
-      # Write dotenv artifact (output multiple names for compatibility across apps)
+      # Write dotenv artifact
+      # AUTO_SKIP_SYNC tells downstream jobs they can use cached data
+      # AUTO_CACHE_HAF_COMMIT is only set for QUICK_TEST mode (explicit cache selection)
+      # For auto-skip mode, smart_cache_lookup will find an available cache
       echo "AUTO_SKIP_BUILD=${CAN_SKIP_BUILD}" > detect_changes.env
       echo "AUTO_SKIP_SYNC=${CAN_SKIP_BUILD}" >> detect_changes.env
-      echo "CACHE_COMMIT=${CACHE_COMMIT}" >> detect_changes.env
-      echo "AUTO_CACHE_HAF_COMMIT=${CACHE_COMMIT}" >> detect_changes.env
-      echo "AUTO_CACHE_COMMIT=${CACHE_COMMIT}" >> detect_changes.env
+      if [ -n "${CACHE_COMMIT:-}" ]; then
+        echo "CACHE_COMMIT=${CACHE_COMMIT}" >> detect_changes.env
+        echo "AUTO_CACHE_HAF_COMMIT=${CACHE_COMMIT}" >> detect_changes.env
+        echo "AUTO_CACHE_COMMIT=${CACHE_COMMIT}" >> detect_changes.env
+      fi
 
       echo ""
       echo "=== Detection Results ==="
@@ -780,10 +785,11 @@ include:
         echo "Using cached HAF data from: ${EFFECTIVE_HAF_COMMIT}"
       fi
 
-      # Build cache paths
+      # Build cache key and job data directory
+      # Cache-manager extracts directly to job directory - no copy needed
+      # DATADIR and SHM_DIR are children of JOB_DATA_DIR
       EFFECTIVE_CACHE_KEY="${EFFECTIVE_HAF_COMMIT}_${CI_COMMIT_SHORT_SHA}"
-      LOCAL_APP_CACHE="${DATA_CACHE_HAF_PREFIX}_${APP_SYNC_CACHE_TYPE}_${EFFECTIVE_CACHE_KEY}"
-      LOCAL_HAF_CACHE="${DATA_CACHE_HAF_PREFIX}_${EFFECTIVE_HAF_COMMIT}"
+      JOB_DATA_DIR="${CI_PROJECT_DIR}/${CI_JOB_ID}"
 
       # Fetch cache-manager
       if [[ ! -x "$CACHE_MANAGER" ]]; then
@@ -793,56 +799,46 @@ include:
       CACHE_HIT="false"
 
-      # Check for existing app sync cache (local first, then NFS)
-      if [[ -d "${LOCAL_APP_CACHE}/datadir" ]]; then
-        echo "Local app sync cache found at ${LOCAL_APP_CACHE} - skipping sync"
+      # Check for existing app sync cache - extract directly to job directory
+      echo "Checking for app sync cache: ${APP_SYNC_CACHE_TYPE}/${EFFECTIVE_CACHE_KEY}"
+      if "$CACHE_MANAGER" get "${APP_SYNC_CACHE_TYPE}" "${EFFECTIVE_CACHE_KEY}" "${JOB_DATA_DIR}" 2>/dev/null; then
+        echo "App sync cache found - skipping sync"
         CACHE_HIT="true"
-        export DATA_SOURCE="${LOCAL_APP_CACHE}"
-      else
-        echo "Checking NFS for app sync cache: ${APP_SYNC_CACHE_TYPE}/${EFFECTIVE_CACHE_KEY}"
-        if CACHE_HANDLING=haf "$CACHE_MANAGER" get "${APP_SYNC_CACHE_TYPE}" "${EFFECTIVE_CACHE_KEY}" "${LOCAL_APP_CACHE}" 2>/dev/null; then
-          echo "Fetched app sync cache from NFS - skipping sync"
-          CACHE_HIT="true"
-          export DATA_SOURCE="${LOCAL_APP_CACHE}"
-        elif [[ "${AUTO_SKIP_SYNC:-false}" == "true" ]]; then
-          # For docs-only changes, search for ANY app cache with matching HAF commit
-          # since app code hasn't changed
-          echo "Exact cache not found, searching for compatible app cache with HAF commit ${EFFECTIVE_HAF_COMMIT}..."
-          NFS_CACHE_DIR="${DATA_CACHE_NFS_PREFIX}/${APP_SYNC_CACHE_TYPE}"
-          FOUND_CACHE=$(ls -t "${NFS_CACHE_DIR}/${EFFECTIVE_HAF_COMMIT}_"*.tar 2>/dev/null | head -1 || true)
-          if [[ -n "$FOUND_CACHE" ]]; then
-            FOUND_KEY=$(basename "$FOUND_CACHE" .tar)
-            echo "Found compatible app cache: ${FOUND_KEY}"
-            LOCAL_APP_CACHE="${DATA_CACHE_HAF_PREFIX}_${APP_SYNC_CACHE_TYPE}_${FOUND_KEY}"
-            if CACHE_HANDLING=haf "$CACHE_MANAGER" get "${APP_SYNC_CACHE_TYPE}" "${FOUND_KEY}" "${LOCAL_APP_CACHE}" 2>/dev/null; then
-              echo "Fetched compatible app sync cache from NFS - skipping sync"
-              CACHE_HIT="true"
-              export DATA_SOURCE="${LOCAL_APP_CACHE}"
-            fi
+      elif [[ "${AUTO_SKIP_SYNC:-false}" == "true" ]]; then
+        # For docs-only changes, search for ANY app cache with matching HAF commit
+        echo "Exact cache not found, searching for compatible app cache with HAF commit ${EFFECTIVE_HAF_COMMIT}..."
+        NFS_CACHE_DIR="${DATA_CACHE_NFS_PREFIX}/${APP_SYNC_CACHE_TYPE}"
+        FOUND_CACHE=$(ls -t "${NFS_CACHE_DIR}/${EFFECTIVE_HAF_COMMIT}_"*.tar 2>/dev/null | head -1 || true)
+        if [[ -n "$FOUND_CACHE" ]]; then
+          FOUND_KEY=$(basename "$FOUND_CACHE" .tar)
+          echo "Found compatible app cache: ${FOUND_KEY}"
+          EFFECTIVE_CACHE_KEY="${FOUND_KEY}" # Update for save_cache
+          if "$CACHE_MANAGER" get "${APP_SYNC_CACHE_TYPE}" "${FOUND_KEY}" "${JOB_DATA_DIR}" 2>/dev/null; then
+            echo "Fetched compatible app sync cache - skipping sync"
+            CACHE_HIT="true"
          fi
-          if [[ "$CACHE_HIT" != "true" ]]; then
-            echo "No compatible app cache found, will use HAF-only cache and run sync"
-          fi
-        else
-          echo "App sync cache not found, will use HAF-only cache and run sync"
        fi
+        if [[ "$CACHE_HIT" != "true" ]]; then
+          echo "No compatible app cache found, will use HAF-only cache and run sync"
+        fi
+      else
+        echo "App sync cache not found, will use HAF-only cache and run sync"
      fi
 
       # Fall back to HAF-only cache if no app cache found
       if [[ "$CACHE_HIT" != "true" ]]; then
-        if [[ -d "${LOCAL_HAF_CACHE}/datadir" ]]; then
-          echo "Local HAF cache found at ${LOCAL_HAF_CACHE}"
+        echo "Fetching HAF cache..."
+        if "$CACHE_MANAGER" get haf "${EFFECTIVE_HAF_COMMIT}" "${JOB_DATA_DIR}"; then
+          echo "HAF replay data ready"
         else
-          echo "Local HAF cache not found, checking NFS..."
-          if "$CACHE_MANAGER" get haf "${EFFECTIVE_HAF_COMMIT}" "${LOCAL_HAF_CACHE}"; then
-            echo "Fetched HAF replay data from NFS cache"
-          else
-            echo "ERROR: Failed to fetch HAF replay data from NFS cache"
-            exit 1
-          fi
+          echo "ERROR: Failed to fetch HAF replay data"
+          exit 1
         fi
       fi
 
+      # DATA_SOURCE points to where data was extracted (for compatibility)
+      export DATA_SOURCE="${JOB_DATA_DIR}"
+
       # Export state for script section
       echo "$CACHE_HIT" > /tmp/cache_hit
       echo "$EFFECTIVE_CACHE_KEY" > /tmp/effective_cache_key
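
The cmd_get rework above copies the NFS tar to local disk and only then extracts, so the slow NFS read happens once, sequentially, and extraction hits fast local storage. A minimal standalone sketch of that copy-then-extract pattern with the same bc-based throughput math (the `fetch_and_extract` name and all paths here are hypothetical, not part of cache-manager.sh):

```bash
#!/usr/bin/env bash
# Sketch of the copy-then-extract pattern from cmd_get; illustrative only.
set -euo pipefail

fetch_and_extract() {
    local nfs_tar="$1" local_tar="$2" dest="$3"

    # Stage the tar on local disk first: one sequential NFS read,
    # then tar extraction runs against local storage instead of NFS.
    if [[ ! -f "$local_tar" ]]; then
        mkdir -p "$(dirname "$local_tar")"
        local start end size duration
        start=$(date +%s.%N)
        cp "$nfs_tar" "$local_tar"
        end=$(date +%s.%N)
        size=$(stat -c %s "$local_tar")
        # bc handles the fractional seconds that bash integer arithmetic cannot
        duration=$(echo "$end - $start" | bc)
        echo "copied in ${duration}s ($(echo "scale=2; $size / 1024 / 1024 / $duration" | bc) MB/s)" >&2
    fi

    mkdir -p "$dest"
    tar xf "$local_tar" -C "$dest"
}

# Illustrative invocation:
fetch_and_extract /nfs/ci-cache/haf/abc123.tar /cache/haf/abc123.tar "$PWD/job-data"
```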
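Extraction itself runs under a shared lock (the `${LOCAL_TAR_FILE}.lock` convention above), so concurrent jobs can read the same tar while a writer is kept out. The locking wrapper sits outside the hunks shown, so the snippet below is an assumption about its shape using flock(1) from util-linux, not a copy of the real `_touch_lock` code path:

```bash
# Assumed shape of the shared-lock extraction step; paths are illustrative.
local_tar=/cache/haf/abc123.tar
dest=/tmp/job-data
touch "${local_tar}.lock"

# -s takes the lock shared (readers coexist); -w gives up after N seconds,
# mirroring CACHE_LOCK_TIMEOUT's 120-second default.
flock -s -w "${CACHE_LOCK_TIMEOUT:-120}" "${local_tar}.lock" \
    tar xf "$local_tar" -C "$dest"
```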
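One subtlety in the detect_changes hunk: after this change CACHE_COMMIT is legitimately unset when only tests or docs changed, so the dotenv guard references it as `${CACHE_COMMIT:-}`. That keeps the test safe even if the surrounding script ever runs with `set -u`:

```bash
set -u
unset CACHE_COMMIT
# "$CACHE_COMMIT" would abort here with "CACHE_COMMIT: unbound variable";
# the :- expansion substitutes an empty string instead.
if [ -n "${CACHE_COMMIT:-}" ]; then
    echo "would write cache hints to detect_changes.env"
fi
```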
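In the CI template's docs-only fallback, "compatible" just means any tar whose key starts with the same HAF commit, and `ls -t` picks the newest one. The same lookup in isolation (directory and commit values are placeholders for `${DATA_CACHE_NFS_PREFIX}/${APP_SYNC_CACHE_TYPE}` and `${EFFECTIVE_HAF_COMMIT}`):

```bash
nfs_dir=/nfs/ci-cache/balance_tracker
haf_commit=abc123

# ls -t sorts newest-first; `|| true` keeps `set -o pipefail` scripts alive
# when the glob matches nothing and ls exits non-zero.
found=$(ls -t "${nfs_dir}/${haf_commit}_"*.tar 2>/dev/null | head -1 || true)
if [[ -n "$found" ]]; then
    key=$(basename "$found" .tar)   # strip directory and .tar to recover the cache key
    echo "reusing cache key: $key"
fi
```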