From 19e99128c1003a531d45867a8e565e926ac4539b Mon Sep 17 00:00:00 2001 From: Dan Notestein Date: Sun, 4 Jan 2026 03:01:37 -0500 Subject: [PATCH 1/2] Add haf-app-tools: shared utilities for HAF applications This adds a new haf-app-tools directory containing scripts and utilities that HAF applications (balance_tracker, haf_block_explorer, hafah, hivemind) currently obtain via the haf submodule. By centralizing these tools here, apps can fetch them at runtime instead of including the entire haf repository as a submodule, significantly simplifying the dependency chain. Contents: - scripts/common.sh: Shared bash utilities (logging, git clone helpers) - scripts/create_haf_app_role.sh: PostgreSQL role creation for HAF apps - scripts/copy_datadir.sh: Data directory copying with NFS cache fallback - python/process_openapi.py: OpenAPI to SQL/nginx generator - config/config_5M.ini: Standard hived config for 5M block testing Also includes migration documentation in docs/haf-app-tools-migration.md. --- docs/haf-app-tools-migration.md | 205 +++++++ haf-app-tools/README.md | 72 +++ haf-app-tools/config/config_5M.ini | 41 ++ haf-app-tools/python/process_openapi.py | 571 +++++++++++++++++++ haf-app-tools/python/requirements.txt | 4 + haf-app-tools/scripts/common.sh | 86 +++ haf-app-tools/scripts/copy_datadir.sh | 206 +++++++ haf-app-tools/scripts/create_haf_app_role.sh | 125 ++++ 8 files changed, 1310 insertions(+) create mode 100644 docs/haf-app-tools-migration.md create mode 100644 haf-app-tools/README.md create mode 100644 haf-app-tools/config/config_5M.ini create mode 100644 haf-app-tools/python/process_openapi.py create mode 100644 haf-app-tools/python/requirements.txt create mode 100644 haf-app-tools/scripts/common.sh create mode 100644 haf-app-tools/scripts/copy_datadir.sh create mode 100644 haf-app-tools/scripts/create_haf_app_role.sh diff --git a/docs/haf-app-tools-migration.md b/docs/haf-app-tools-migration.md new file mode 100644 index 0000000..76e72c7 --- /dev/null +++ b/docs/haf-app-tools-migration.md @@ -0,0 +1,205 @@ +# HAF App Tools Migration Plan + +This document outlines the migration of shared HAF application utilities from the `haf` repository to `common-ci-configuration/haf-app-tools/`. + +## Motivation + +Currently, HAF applications (balance_tracker, haf_block_explorer, hafah, hivemind) include the `haf` repo as a submodule primarily to access: +- Shared scripts (`common.sh`, `create_haf_app_role.sh`, `copy_datadir.sh`) +- OpenAPI processor (`process_openapi.py`) +- Config files (`config_5M.ini`) +- Test infrastructure (`tests_api`) + +This creates a deep dependency chain: `app → haf → hive → test-tools/tests_api` + +By moving the shared utilities to `common-ci-configuration`, apps can: +1. Drop the `haf` submodule entirely +2. Fetch utilities at runtime (like existing `HIVE_SCRIPTS_REF` pattern) +3. 
Only add `tests_api` as a direct submodule if needed for testing + +## Directory Structure + +``` +common-ci-configuration/ +├── haf-app-tools/ +│ ├── scripts/ +│ │ ├── common.sh # Shared bash utilities +│ │ ├── create_haf_app_role.sh # PostgreSQL role setup +│ │ └── copy_datadir.sh # Data directory copying with NFS fallback +│ ├── python/ +│ │ ├── process_openapi.py # OpenAPI → SQL/nginx generator +│ │ └── requirements.txt # deepmerge, jsonpointer, pyyaml +│ └── config/ +│ └── config_5M.ini # Standard 5M block HAF config +``` + +## Files to Migrate + +### From `haf/scripts/` + +| File | Purpose | Dependencies | +|------|---------|--------------| +| `common.sh` | Utility functions: `log_exec_params`, `do_clone_commit`, `do_clone_branch` | None | +| `create_haf_app_role.sh` | Creates HAF app PostgreSQL roles with proper group membership | Sources `common.sh` | +| `copy_datadir.sh` | Copies data directories with NFS fallback, fixes pg_tblspc symlinks | Uses `cache-manager.sh` (already in common-ci-configuration) | + +### From `haf/scripts/` + +| File | Purpose | Dependencies | +|------|---------|--------------| +| `process_openapi.py` | Generates SQL types/functions and nginx rewrite rules from OpenAPI YAML in SQL comments | `deepmerge`, `jsonpointer`, `pyyaml` | + +### From `haf/docker/` + +| File | Purpose | Dependencies | +|------|---------|--------------| +| `config_5M.ini` | Standard hived config for 5M block replay testing | None | + +## Required Modifications + +### 1. `create_haf_app_role.sh` + +Change the source line to fetch `common.sh` at runtime: + +```bash +# Before (line 6): +source "$SCRIPTPATH/common.sh" + +# After: +COMMON_CI_URL="${COMMON_CI_URL:-https://gitlab.syncad.com/hive/common-ci-configuration/-/raw/develop}" +if [[ ! -f "$SCRIPTPATH/common.sh" ]]; then + curl -fsSL "${COMMON_CI_URL}/haf-app-tools/scripts/common.sh" -o /tmp/common.sh + source /tmp/common.sh +else + source "$SCRIPTPATH/common.sh" +fi +``` + +### 2. `copy_datadir.sh` + +Already fetches `cache-manager.sh` from common-ci-configuration. No changes needed. + +### 3. `process_openapi.py` + +No changes needed to the script itself. 
Apps need to install dependencies: +```bash +pip install deepmerge jsonpointer pyyaml +``` + +## How Apps Will Fetch Tools + +### CI Variable Setup + +Apps should define in their `.gitlab-ci.yml`: + +```yaml +variables: + # Reference to common-ci-configuration for fetching tools + COMMON_CI_REF: "develop" # or pin to a specific commit + COMMON_CI_URL: "https://gitlab.syncad.com/hive/common-ci-configuration/-/raw/${COMMON_CI_REF}" +``` + +### Fetching Scripts + +```yaml +.fetch_haf_app_tools: + before_script: + - mkdir -p /tmp/haf-app-tools + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/scripts/common.sh" -o /tmp/haf-app-tools/common.sh + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/scripts/create_haf_app_role.sh" -o /tmp/haf-app-tools/create_haf_app_role.sh + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/scripts/copy_datadir.sh" -o /tmp/haf-app-tools/copy_datadir.sh + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/python/process_openapi.py" -o /tmp/haf-app-tools/process_openapi.py + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/config/config_5M.ini" -o /tmp/haf-app-tools/config_5M.ini + - chmod +x /tmp/haf-app-tools/*.sh +``` + +### Using in Jobs + +```yaml +some_job: + extends: .fetch_haf_app_tools + script: + - /tmp/haf-app-tools/create_haf_app_role.sh --postgres-url="$POSTGRES_URL" --haf-app-account="myapp" + - python3 /tmp/haf-app-tools/process_openapi.py output/ endpoints/*.sql +``` + +## Migration Steps Per App + +### balance_tracker + +1. **Add `tests_api` submodule** (for `validate_response` module): + ```bash + git submodule add ../tests_api.git tests_api + ``` + +2. **Update `.gitlab-ci.yml`**: + - Add `COMMON_CI_REF` variable + - Change `pip install -e "${CI_PROJECT_DIR}/haf/hive/tests/python/hive-local-tools/tests_api"` to `pip install -e "${CI_PROJECT_DIR}/tests_api"` + - Change `CONFIG_INI_SOURCE: "$CI_PROJECT_DIR/haf/docker/config_5M.ini"` to fetch from common-ci-configuration + +3. **Update `scripts/openapi_rewrite.sh`**: + - Change `python3 $haf_dir/scripts/process_openapi.py` to use fetched script + +4. **Remove haf submodule**: + ```bash + git submodule deinit haf + git rm haf + rm -rf .git/modules/haf + ``` + +5. **Clean up**: + - Remove haf-related entries from `.gitmodules` + - Remove pre_get_sources hook logic for haf submodule corruption + - Remove git safe.directory entries for haf + +### haf_block_explorer + +Same as balance_tracker, plus: +- Update `submodules/haf` path to new approach +- May need to update nested submodule handling (btracker, hafah, reptracker) + +### hafah + +1. **Add `tests_api` submodule** +2. **Update Dockerfile.setup**: + - Change `COPY haf/scripts/common.sh` to fetch at build time + - Change `COPY haf/scripts/create_haf_app_role.sh` similarly +3. **Update `.gitlab-ci.yml`** as above +4. **Update `scripts/openapi_rewrite.sh`** +5. **Remove haf submodule** + +### hivemind + +1. **Update `scripts/ci-helpers/build_instance.sh`**: + - Change `source "$SCRIPTSDIR/../haf/scripts/common.sh"` to fetch from common-ci-configuration +2. **Update `scripts/setup_postgres.sh`**: + - Change calls to `haf/scripts/create_haf_app_role.sh` +3. **Update `.gitlab-ci.yml`**: + - Change `CONFIG_INI_SOURCE` and `copy_datadir.sh` references +4. **Note**: hivemind does NOT use `process_openapi.py` or `tests_api` + +## Testing the Migration + +1. Create branch in common-ci-configuration with haf-app-tools +2. Update one app (e.g., balance_tracker) to use the new approach +3. 
Run full CI pipeline to verify: + - Scripts fetch correctly + - PostgreSQL role creation works + - OpenAPI processing works + - Tests pass +4. Once verified, migrate remaining apps + +## Rollback Plan + +If issues occur, apps can temporarily: +1. Re-add haf submodule +2. Revert CI changes + +The haf repository will retain the original scripts during the transition period. + +## Timeline + +1. **Phase 1**: Add haf-app-tools to common-ci-configuration +2. **Phase 2**: Migrate balance_tracker as pilot +3. **Phase 3**: Migrate remaining apps (hafah, haf_block_explorer, hivemind) +4. **Phase 4**: (Optional) Deprecate scripts in haf/scripts/ with redirect notice diff --git a/haf-app-tools/README.md b/haf-app-tools/README.md new file mode 100644 index 0000000..9db5d5a --- /dev/null +++ b/haf-app-tools/README.md @@ -0,0 +1,72 @@ +# HAF App Tools + +Shared utilities for HAF (Hive Application Framework) applications. + +These tools were previously located in the `haf` repository and required apps to include `haf` as a submodule. By moving them here, apps can fetch them at runtime without the heavy submodule dependency. + +## Contents + +### Scripts (`scripts/`) + +| Script | Purpose | +|--------|---------| +| `common.sh` | Shared bash utilities: `log_exec_params`, `do_clone_commit`, `do_clone_branch` | +| `create_haf_app_role.sh` | Creates HAF application PostgreSQL roles with proper group membership | +| `copy_datadir.sh` | Copies HAF data directories with NFS cache fallback and symlink fixing | + +### Python (`python/`) + +| File | Purpose | +|------|---------| +| `process_openapi.py` | Generates SQL types/functions and nginx rewrite rules from OpenAPI YAML embedded in SQL comments | +| `requirements.txt` | Python dependencies for `process_openapi.py` | + +### Config (`config/`) + +| File | Purpose | +|------|---------| +| `config_5M.ini` | Standard hived configuration for 5M block replay testing | + +## Usage + +### Fetching at Runtime (Recommended for CI) + +```yaml +variables: + COMMON_CI_REF: "develop" # or pin to specific commit + COMMON_CI_URL: "https://gitlab.syncad.com/hive/common-ci-configuration/-/raw/${COMMON_CI_REF}" + +.fetch_haf_app_tools: + before_script: + - mkdir -p /tmp/haf-app-tools + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/scripts/common.sh" -o /tmp/haf-app-tools/common.sh + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/scripts/create_haf_app_role.sh" -o /tmp/haf-app-tools/create_haf_app_role.sh + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/scripts/copy_datadir.sh" -o /tmp/haf-app-tools/copy_datadir.sh + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/python/process_openapi.py" -o /tmp/haf-app-tools/process_openapi.py + - curl -fsSL "${COMMON_CI_URL}/haf-app-tools/config/config_5M.ini" -o /tmp/haf-app-tools/config_5M.ini + - chmod +x /tmp/haf-app-tools/*.sh +``` + +### Using Scripts + +```bash +# Create HAF app role +/tmp/haf-app-tools/create_haf_app_role.sh \ + --postgres-url="postgresql://haf_admin@localhost/haf_block_log" \ + --haf-app-account="myapp" + +# Process OpenAPI +pip install -r /tmp/haf-app-tools/requirements.txt +python3 /tmp/haf-app-tools/process_openapi.py output_dir/ endpoints/*.sql +``` + +## Migration from haf Submodule + +See [docs/haf-app-tools-migration.md](../docs/haf-app-tools-migration.md) for detailed migration instructions. 
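+
+In each app the end state is the same: once its CI and scripts use the fetched tools, the `haf` submodule can be dropped entirely, for example:
+
+```bash
+# run in the app repository once its CI and scripts use the fetched tools
+git submodule deinit haf
+git rm haf
+rm -rf .git/modules/haf
+```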
+ +## Apps Using These Tools + +- **balance_tracker** - Uses `process_openapi.py`, `config_5M.ini` +- **haf_block_explorer** - Uses `process_openapi.py`, `config_5M.ini` +- **hafah** - Uses `process_openapi.py`, `common.sh`, `create_haf_app_role.sh`, `config_5M.ini` +- **hivemind** - Uses `common.sh`, `create_haf_app_role.sh`, `copy_datadir.sh`, `config_5M.ini` diff --git a/haf-app-tools/config/config_5M.ini b/haf-app-tools/config/config_5M.ini new file mode 100644 index 0000000..e7eb917 --- /dev/null +++ b/haf-app-tools/config/config_5M.ini @@ -0,0 +1,41 @@ + +log-appender = {"appender":"stderr","stream":"std_error"} +log-logger = {"name":"default","level":"info","appender":"stderr"} + +backtrace = yes + +plugin = webserver p2p json_rpc +plugin = database_api +plugin = condenser_api +plugin = block_api +#plugin = witness +#plugin = rc + +# market_history enabled per abw request +plugin = market_history +plugin = market_history_api + +plugin = state_snapshot + +# sql serializer +plugin = sql_serializer +# Actual database URL is passed to hived commandline by docker_entrypoint.sh +#psql-url = postgresql://user:pass@localhost:5432/haf_block_log +psql-index-threshold = 1000000 + +# Actual SHM file directory is passed to hived commandline by docker_entrypoint.sh +#shared-file-dir = "/run/hive" +shared-file-size = 1G +shared-file-full-threshold = 9500 +shared-file-scale-rate = 1000 + +flush-state-interval = 0 + +market-history-bucket-size = [15,60,300,3600,86400] +market-history-buckets-per-size = 5760 + +transaction-status-block-depth = 64000 +transaction-status-track-after-block = 42000000 + +webserver-thread-pool-size = 8 + diff --git a/haf-app-tools/python/process_openapi.py b/haf-app-tools/python/process_openapi.py new file mode 100644 index 0000000..2365258 --- /dev/null +++ b/haf-app-tools/python/process_openapi.py @@ -0,0 +1,571 @@ +#! /usr/bin/env python +import yaml +import json +import re +import sys +import argparse + +from pathlib import Path + +from deepmerge import Merger +from jsonpointer import resolve_pointer + +# Create a custom merger that replaces lists instead of appending them +# This prevents duplication of enum values, oneOf arrays, etc. 
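+# (e.g. when two fragments both define the same enum, the later list replaces the
+# earlier one rather than being concatenated into a list with duplicate members)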
+openapi_merger = Merger( + # List strategy: replace the old list with the new one + [(list, ["override"]), + (dict, ["merge"]), + (set, ["union"])], + # Fallback strategies + ["override"], + # Type conflict strategies + ["override"] +) + +collected_openapi_fragments = {} +all_openapi_fragments = {} # Includes internal endpoints for rewrite rules + +def merge_openapi_fragment(new_fragment): + global collected_openapi_fragments, all_openapi_fragments + # sys.stdout.write('Before:') + # sys.stdout.write(yaml.dump(collected_openapi_fragments, Dumper=yaml.Dumper)) + + # Always merge to all_openapi_fragments for rewrite rules + all_openapi_fragments = openapi_merger.merge(all_openapi_fragments, new_fragment) + + # Check if this is a path fragment with x-internal flag + filtered_fragment = new_fragment.copy() + if 'paths' in filtered_fragment: + filtered_paths = {} + for path, methods in filtered_fragment['paths'].items(): + filtered_methods = {} + for method, method_data in methods.items(): + # Skip internal endpoints in the public API spec + if not method_data.get('x-internal', False): + filtered_methods[method] = method_data + if filtered_methods: + filtered_paths[path] = filtered_methods + if filtered_paths: + filtered_fragment['paths'] = filtered_paths + else: + # If all paths were internal, don't add the fragment at all + if len(filtered_fragment) == 1: # Only 'paths' key + return + else: + del filtered_fragment['paths'] + + collected_openapi_fragments = openapi_merger.merge(collected_openapi_fragments, filtered_fragment) + # sys.stdout.write('After:') + # sys.stdout.write(yaml.dump(collected_openapi_fragments, Dumper=yaml.Dumper)) + +def generate_code_for_enum_openapi_fragment(enum_name, enum_values_openapi_fragment, sql_output): + sql_output.write('-- openapi-generated-code-begin\n') + sql_output.write(f'DROP TYPE IF EXISTS {enum_name} CASCADE;\n') + sql_output.write(f'CREATE TYPE {enum_name} AS ENUM (\n') + sql_output.write(',\n'.join([f' \'{enum_value}\'' for enum_value in enum_values_openapi_fragment])) + sql_output.write('\n);\n') + sql_output.write('-- openapi-generated-code-end\n') + +def generate_type_string_from_schema(schema): + if 'x-sql-datatype' in schema: + return schema['x-sql-datatype'] + elif '$ref' in schema: + reference = schema['$ref'] + # openapi references typically start with #, but that's not a valid json pointer + if len(reference) > 0 and reference[0] == '#': + reference = reference[1:] + referent = resolve_pointer(collected_openapi_fragments, reference) + if 'type' in referent and referent['type'] == 'array': + # special case, if it's an array, we don't use the name of the type, we say SETOF the contained type + if 'items' in referent: + return 'SETOF ' + generate_type_string_from_schema(referent['items']) + else: + return reference.split('/')[-1] + elif 'type' in schema: + schema_type = schema['type'] + if schema_type == 'integer': + return 'INT' + elif schema_type == 'string': + if 'format' in schema: + if schema['format'] == 'date-time': + return 'TIMESTAMP' + return 'TEXT' + elif schema_type == 'array': + items = schema['items'] + if '$ref' in items: + reference = items['$ref'] + # openapi references typically start with #, but that's not a valid json pointer + if len(reference) > 0 and reference[0] == '#': + reference = reference[1:] + referent = resolve_pointer(collected_openapi_fragments, reference) + return reference.split('/')[-1] + '[]' + if 'type' in items: + return generate_type_string_from_schema(items) + '[]' + elif schema_type == 'boolean': + 
return 'BOOLEAN' + elif schema_type == 'number': + return 'FLOAT' + elif schema_type == 'object': + return 'object' + else: + assert(False) + +def generate_default_value_string_from_schema(schema): + if 'default' in schema: + default_value = str(schema['default']) + + requires_quoting = True + if default_value == str(None): + default_value = 'NULL' + requires_quoting = False + elif 'type' in schema: + schema_type = schema['type'] + if schema_type == 'integer' or schema_type == 'number' or schema_type == 'boolean': + requires_quoting = False + elif 'x-sql-datatype' in schema: + sql_datatype = schema['x-sql-datatype'] + if sql_datatype.upper() == 'INT' or sql_datatype.upper() == 'FLOAT': + requires_quoting = False + elif '$ref' in schema: + reference = schema['$ref'] + if len(reference) > 0 and reference[0] == '#': + reference = reference[1:] + referent = resolve_pointer(collected_openapi_fragments, reference) + if requires_quoting: + default_value = f'\'{default_value}\'' + elif 'x-sql-default-value' in schema: + # it's assumed that x-sql-default-value is already properly quoted + # that enables you to use either a string like '2000-01-01' or an expression like NOW() + default_value = str(schema['x-sql-default-value']) + else: + return '' + + return ' = ' + default_value + + +def generate_type_field_or_parameter_string_from_openapi_fragment(property_name, property_properties, include_default_values = False): + type_string = generate_type_string_from_schema(property_properties) + type_field_string = f' "{property_name}" {type_string}' + if include_default_values: + type_field_string += generate_default_value_string_from_schema(property_properties) + return type_field_string + +def generate_code_for_object_openapi_fragment(object_name, object_openapi_fragment, sql_output): + sql_output.write('-- openapi-generated-code-begin\n') + sql_output.write(f'DROP TYPE IF EXISTS {object_name} CASCADE;\n') + sql_output.write(f'CREATE TYPE {object_name} AS (\n') + sql_output.write(',\n'.join([generate_type_field_or_parameter_string_from_openapi_fragment(property_name, property_properties) for property_name, property_properties in object_openapi_fragment.items()])) + sql_output.write('\n);\n') + sql_output.write('-- openapi-generated-code-end\n') + +def generate_function_signature(method, method_fragment, sql_output): + def generate_parameter_string(parameter_openapi_fragment, include_default_values): + name = parameter_openapi_fragment['name'] + schema = parameter_openapi_fragment['schema'] + return generate_type_field_or_parameter_string_from_openapi_fragment(name, schema, include_default_values = include_default_values) + + def generate_parameter_list(parameters_openapi_fragment, include_default_values): + return ',\n'.join([generate_parameter_string(param, include_default_values) for param in parameters_openapi_fragment]) + + assert('operationId' in method_fragment) + operationId = method_fragment['operationId'] + + + assert('responses' in method_fragment) + responses = method_fragment['responses'] + assert('200' in responses) + ok_response = responses['200'] + assert('content' in ok_response) + content = ok_response['content'] + assert('application/json' in content) + json_response = content['application/json'] + assert('schema' in json_response) + response_schema = json_response['schema'] + response_type_string = generate_type_string_from_schema(response_schema) + + + sql_output.write('-- openapi-generated-code-begin\n') + if 'parameters' not in method_fragment or method_fragment['parameters'] == 
None: + sql_output.write(f'DROP FUNCTION IF EXISTS {operationId};\n') + sql_output.write(f'CREATE OR REPLACE FUNCTION {operationId}()\n') + sql_output.write(f'RETURNS {response_type_string} \n') + sql_output.write('-- openapi-generated-code-end\n') + else: + parameters_openapi_fragment = method_fragment['parameters'] + #parameters = generate_parameter_list(parameters_openapi_fragment) + #(\n{generate_parameter_list(parameters_openapi_fragment, False)}\n) + #- removed because on runtime code upgrade the functions that might have parameter changes won't be dropped due to parameters not matching + sql_output.write(f'DROP FUNCTION IF EXISTS {operationId};\n') + sql_output.write(f'CREATE OR REPLACE FUNCTION {operationId}(\n{generate_parameter_list(parameters_openapi_fragment, True)}\n)\n') + sql_output.write(f'RETURNS {response_type_string} \n') + sql_output.write('-- openapi-generated-code-end\n') + +def generate_code_from_openapi_fragment(openapi_fragment, sql_output): + # figure out what type of fragment this is so we know what to generate + if len(openapi_fragment) == 1: + key = next(iter(openapi_fragment)) + if key == 'components': + components = openapi_fragment[key] + assert(len(components) == 1) + assert('schemas' in components) + schemas = components['schemas'] + assert(len(schemas) == 1) + schema_name = next(iter(schemas)) + schema = schemas[schema_name] + assert('type' in schema) + if schema['type'] == 'string' and 'enum' in schema: + generate_code_for_enum_openapi_fragment(schema_name, schema['enum'], sql_output) + elif schema['type'] == 'object' and 'properties' and 'x-sql-datatype' in schema: + pass + elif schema['type'] == 'object' and 'properties' in schema: + generate_code_for_object_openapi_fragment(schema_name, schema['properties'], sql_output) + elif schema['type'] == 'array': + # don't generate code for arrays. when these are returned, the generated SQL + # uses SETOF underlying_data_type + pass + else: + assert(False) + elif key == 'paths': + paths = openapi_fragment[key] + assert(len(paths) == 1) + path = next(iter(paths)) + methods = paths[path] + assert(len(methods) == 1) + method = next(iter(methods)) + method_fragment = methods[method] + generate_function_signature(method, method_fragment, sql_output) + else: + # we don't know how to generate code for this fragment, assume it's just a fragment we pass through + pass + else: + # we don't know how to generate code for this fragment, assume it's just a fragment + pass + +# return true if this is a PostgreSQL keyword. 
List taken from https://www.postgresql.org/docs/current/sql-keywords-appendix.html +# excluding all keywords which are marked 'reserved', or 'non-reserved' but with qualifications +def is_sql_keyword(word): + keywords = {'BETWEEN', + 'BIGINT', + 'BIT', + 'BOOLEAN', + 'COALESCE', + 'DEC', + 'DECIMAL', + 'EXISTS', + 'EXTRACT', + 'FLOAT', + 'GREATEST', + 'GROUPING', + 'INOUT', + 'INT', + 'INTEGER', + 'INTERVAL', + 'JSON_ARRAY', + 'JSON_ARRAYAGG', + 'JSON_OBJECT', + 'JSON_OBJECTAGG', + 'LEAST', + 'NATIONAL', + 'NCHAR', + 'NONE', + 'NORMALIZE', + 'NULLIF', + 'NUMERIC', + 'OUT', + 'OVERLAY', + 'POSITION', + 'REAL', + 'ROW', + 'SETOF', + 'SMALLINT', + 'SUBSTRING', + 'TIME', + 'TIMESTAMP', + 'TREAT', + 'TRIM', + 'VALUES', + 'VARCHAR', + 'XMLATTRIBUTES', + 'XMLCONCAT', + 'XMLELEMENT', + 'XMLEXISTS', + 'XMLFOREST', + 'XMLNAMESPACES', + 'XMLPARSE', + 'XMLPI', + 'XMLROOT', + 'XMLSERIALIZE', + 'XMLTABLE', + 'CHAR', + 'CHARACTER', + 'PRECISION', + 'DAY', + 'FILTER', + 'HOUR', + 'MINUTE', + 'MONTH', + 'OVER', + 'SECOND', + 'VARYING', + 'WITHIN', + 'WITHOUT', + 'YEAR', + 'ALL', + 'ANALYSE', + 'ANALYZE', + 'AND', + 'ANY', + 'ASC', + 'ASYMMETRIC', + 'BOTH', + 'CASE', + 'CAST', + 'CHECK', + 'COLLATE', + 'COLUMN', + 'CONSTRAINT', + 'CURRENT_CATALOG', + 'CURRENT_DATE', + 'CURRENT_ROLE', + 'CURRENT_TIME', + 'CURRENT_TIMESTAMP', + 'CURRENT_USER', + 'DEFAULT', + 'DEFERRABLE', + 'DESC', + 'DISTINCT', + 'DO', + 'ELSE', + 'END', + 'FALSE', + 'FOREIGN', + 'IN', + 'INITIALLY', + 'LATERAL', + 'LEADING', + 'LOCALTIME', + 'LOCALTIMESTAMP', + 'NOT', + 'NULL', + 'ONLY', + 'OR', + 'PLACING', + 'PRIMARY', + 'REFERENCES', + 'SELECT', + 'SESSION_USER', + 'SOME', + 'SYMMETRIC', + 'SYSTEM_USER', + 'TABLE', + 'THEN', + 'TRAILING', + 'TRUE', + 'UNIQUE', + 'USER', + 'USING', + 'VARIADIC', + 'WHEN', + 'AUTHORIZATION', + 'BINARY', + 'COLLATION', + 'CONCURRENTLY', + 'CROSS', + 'CURRENT_SCHEMA', + 'FREEZE', + 'FULL', + 'ILIKE', + 'INNER', + 'IS', + 'JOIN', + 'LEFT', + 'LIKE', + 'NATURAL', + 'OUTER', + 'RIGHT', + 'SIMILAR', + 'TABLESAMPLE', + 'VERBOSE', + 'ISNULL', + 'NOTNULL', + 'OVERLAPS', + 'ARRAY', + 'AS', + 'CREATE', + 'EXCEPT', + 'FETCH', + 'FOR', + 'FROM', + 'GRANT', + 'GROUP', + 'HAVING', + 'INTERSECT', + 'INTO', + 'LIMIT', + 'OFFSET', + 'ON', + 'ORDER', + 'RETURNING', + 'TO', + 'UNION', + 'WHERE', + 'WINDOW', + 'WITH'} + return word.upper() in keywords + + +def dump_openapi_spec(sql_output): + sql_output.write('-- openapi-generated-code-begin\n') + sql_output.write(' openapi json = $$\n') + sql_output.write(json.dumps(collected_openapi_fragments, indent = 2)) + sql_output.write('\n$$;\n') + sql_output.write('-- openapi-generated-code-end\n') + +def generate_rewrite_rules(rewrite_rules_file, use_home_rewrite=False): + # Use all_openapi_fragments for rewrite rules (includes internal endpoints) + if 'paths' in all_openapi_fragments: + with open(rewrite_rules_file, 'w') as rewrite_rules_file: + # generate default rules that are always the same + + if use_home_rewrite: + rewrite_rules_file.write(f'# endpoint for json-rpc 2.0\n') + rewrite_rules_file.write(f'rewrite ^/(.*)$ /rpc/home break;\n\n') + else: + rewrite_rules_file.write(f'# default endpoint for everything else\n') + rewrite_rules_file.write(f'rewrite ^/(.*)$ /rpc/$1 break;\n\n') + + if not use_home_rewrite: + rewrite_rules_file.write(f'# endpoint for openapi spec itself\n') + rewrite_rules_file.write(f'rewrite ^/$ / break;\n\n') + + for path, methods_for_path in all_openapi_fragments['paths'].items(): + for method, method_data in methods_for_path.items(): + path_parts = 
path.split('/') + # paths in openapi spec will start with / and then the name of the API, like: GET /hafbe/witnesses + # an upstream server will remove the name of the API, so we get rid of it here: + if len(path_parts) > 1: + path_parts = path_parts[1:] + rewrite_parts = ['^'] + query_parts = [] + next_placeholder = 1 + rpc_method_name = method_data['operationId'].split('.')[-1] + path_filter_present = False # Track if `path_filter` is a parameter + + # Check for parameters + if 'parameters' in method_data: + for param in method_data['parameters']: + if param['name'] == 'path-filter': + path_filter_present = True + + for path_part in path_parts: + assert(len(path_part) > 0) + if path_part[0] == '{' and path_part[-1] == '}': + rewrite_parts.append('([^/]+)') + param_name = path_part[1:-1] + query_parts.append(f'{param_name}=${next_placeholder}') + next_placeholder += 1 + else: + rewrite_parts.append(path_part) + + rewrite_from = '/'.join(rewrite_parts) + # Construct the query string + query_string = '?' + '&'.join(query_parts) if query_parts else '' + + rewrite_to = f'/rpc/{rpc_method_name}{query_string}' + + # Add the path_filter parameter if present + if path_filter_present: + if query_string: # If we have existing query params + rewrite_to += '&path-filter=$path_filters' + else: # No existing query params + rewrite_to += '?path-filter=$path_filters' + + # Add comment indicating if endpoint is internal + internal_comment = ' (internal)' if method_data.get('x-internal', False) else '' + rewrite_rules_file.write(f'# endpoint for {method} {path}{internal_comment}\n') + rewrite_rules_file.write(f'rewrite {rewrite_from} {rewrite_to} break;\n\n') + +def process_sql_file(sql_input, sql_output): + yaml_comment_path = [] + yaml_comment_lines = [] + in_yaml_comment = False + in_generated_code = False + + def finish_comment(): + nonlocal yaml_comment_lines + nonlocal yaml_comment_path + comment_yaml = yaml.load(''.join(yaml_comment_lines), Loader=yaml.FullLoader) + for path_element in reversed(yaml_comment_path): + comment_yaml = {path_element: comment_yaml} + if sql_output != None: + generate_code_from_openapi_fragment(comment_yaml, sql_output) + else: + merge_openapi_fragment(comment_yaml) + #print(comment_yaml) + #sys.stdout.write(yaml.dump(comment_yaml, Dumper=yaml.Dumper)) + yaml_comment_lines = [] + yaml_comment_path = [] + + for line in sql_input: + if in_yaml_comment: + if sql_output != None: + sql_output.write(line) + if re.match(r'^\s*\*\/\s*$', line): + in_yaml_comment = False + finish_comment() + continue + else: + yaml_comment_lines.append(line) + elif in_generated_code: + if line == '-- openapi-generated-code-end\n': + in_generated_code = False + continue + else: + if line == '-- openapi-generated-code-begin\n': + in_generated_code = True + continue + if sql_output != None: + sql_output.write(line) + + matches_openapi_spec_comment = re.match(r'^\s*-- openapi-spec\s*$', line) + if matches_openapi_spec_comment: + dump_openapi_spec(sql_output) + + matches_openapi_fragment = re.match(r'^\s*\/\*\*\s*openapi(?::((?:\w+)(?::\w+)*))?\s*$', line) + if matches_openapi_fragment: + if matches_openapi_fragment.group(1): + yaml_comment_path = matches_openapi_fragment.group(1).split(':') + else: + yaml_comment_path = [] + in_yaml_comment = True + +def process_sql_files(input_sql_filenames, output_dir = None): + for input_sql_filename in input_sql_filenames: + with open(input_sql_filename) as sql_input: + if output_dir == None: + process_sql_file(sql_input, None) + else: + output_sql_filename = 
output_dir / Path(input_sql_filename) + output_sql_filename.parent.mkdir(parents = True, exist_ok = True) + with output_sql_filename.open(mode = 'w') as sql_output: + process_sql_file(sql_input, sql_output) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("output_dir", type=Path) + parser.add_argument("input_files", nargs='+') + parser.add_argument("--home-rewrite", action="store_true", help="Use /rpc/home rewrite rule") + args = parser.parse_args() + + output_dir = args.output_dir + input_files = args.input_files + rewrite_rules_file = 'rewrite_rules.conf' + use_home_rewrite = args.home_rewrite + + # Do a first pass that just collects all the openapi fragments + process_sql_files(input_files) + # Then a second pass that does the substitutions, writing output files to `output_dir` + process_sql_files(input_files, output_dir) + # and dump the nginx rewrite rules + generate_rewrite_rules(rewrite_rules_file, use_home_rewrite) diff --git a/haf-app-tools/python/requirements.txt b/haf-app-tools/python/requirements.txt new file mode 100644 index 0000000..d93befe --- /dev/null +++ b/haf-app-tools/python/requirements.txt @@ -0,0 +1,4 @@ +# Requirements for process_openapi.py +deepmerge>=1.1.0 +jsonpointer>=2.3 +pyyaml>=6.0 diff --git a/haf-app-tools/scripts/common.sh b/haf-app-tools/scripts/common.sh new file mode 100644 index 0000000..780d08c --- /dev/null +++ b/haf-app-tools/scripts/common.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# +# Common utility functions for HAF applications +# Fetched from: common-ci-configuration/haf-app-tools/scripts/common.sh +# + +set -euo pipefail + +# Only setup logging if LOG_FILE is defined +if [[ -n "${LOG_FILE:-}" ]]; then + exec > >(tee -i "${LOG_FILE}") 2>&1 +fi + +log_exec_params() { + echo + echo -n "$0 parameters: " + for arg in "$@"; do echo -n "$arg "; done + echo +} + +do_clone_commit() { + local commit="$1" + local src_dir=$2 + local repo_url=$3 + + echo "Cloning commit: $commit from $repo_url into: $src_dir ..." + mkdir -p "$src_dir" + pushd "$src_dir" + + git init + git remote add origin "$repo_url" + git fetch --depth 1 origin "$commit" + git checkout FETCH_HEAD + + # Check if hive submodule needs special handling (feature branch) + if [[ -f .gitmodules ]] && grep -q "branch = feature/" .gitmodules; then + HIVE_BRANCH=$(git config -f .gitmodules submodule.hive.branch 2>/dev/null || echo "") + if [[ -n "$HIVE_BRANCH" ]]; then + echo "Initializing hive submodule from feature branch: $HIVE_BRANCH" + HIVE_COMMIT=$(git ls-tree HEAD hive | awk '{print $3}') + HIVE_URL=$(git config -f .gitmodules submodule.hive.url) + # Convert relative URL to absolute if needed + if [[ "$HIVE_URL" == ../* ]]; then + HIVE_URL="https://gitlab.syncad.com/hive/hive.git" + fi + # Clone the hive submodule, then fetch feature branch and checkout commit + rm -rf hive + git clone --no-checkout "$HIVE_URL" hive + pushd hive + git fetch origin "$HIVE_BRANCH" --depth=1 + git fetch --depth=1 origin "$HIVE_COMMIT" || true + git checkout "$HIVE_COMMIT" + popd + # Now update remaining submodules recursively + git submodule update --init --recursive + else + git submodule update --init --recursive + fi + else + git submodule update --init --recursive + fi + + popd +} + +do_clone_branch() { + local branch=$1 + local src_dir="$2" + local repo_url="$3" + echo "Cloning branch: $branch from $repo_url ..." 
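+  # shallow, single-branch clone; submodules are cloned shallowly as well to keep CI checkouts small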
+ git clone --recurse-submodules --shallow-submodules --single-branch --depth=1 --branch "$branch" -- "$repo_url" "$src_dir" +} + + +do_clone() { + local branch=$1 + local src_dir="$2" + local repo_url="$3" + local commit="$4" + + if [[ "$commit" != "" ]]; then + do_clone_commit "$commit" "$src_dir" "$repo_url" + else + do_clone_branch "$branch" "$src_dir" "$repo_url" + fi +} diff --git a/haf-app-tools/scripts/copy_datadir.sh b/haf-app-tools/scripts/copy_datadir.sh new file mode 100644 index 0000000..8263383 --- /dev/null +++ b/haf-app-tools/scripts/copy_datadir.sh @@ -0,0 +1,206 @@ +#!/bin/bash +# +# Copies HAF data directory with NFS cache fallback +# Fetched from: common-ci-configuration/haf-app-tools/scripts/copy_datadir.sh +# + +set -xeuo pipefail + +# Default shared block_log location (used when blockchain not in cache) +# Try NFS cache first, fall back to local cache +SHARED_BLOCK_LOG_DIR="${SHARED_BLOCK_LOG_DIR:-/nfs/ci-cache/hive/block_log_5m}" + +# NFS cache configuration +CACHE_NFS_PATH="${CACHE_NFS_PATH:-/nfs/ci-cache}" + +# Cache manager script - fetch from common-ci-configuration if not available locally +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +COMMON_CI_URL="${COMMON_CI_URL:-https://gitlab.syncad.com/hive/common-ci-configuration/-/raw/develop}" + +# Try local paths first, then fetch +CACHE_MANAGER="" +for path in "$SCRIPT_DIR/cache-manager.sh" "$SCRIPT_DIR/../cache-manager.sh" "/tmp/cache-manager.sh"; do + if [[ -x "$path" ]]; then + CACHE_MANAGER="$path" + break + fi +done + +if [[ -z "$CACHE_MANAGER" ]]; then + CACHE_MANAGER="/tmp/cache-manager.sh" + curl -fsSL "${COMMON_CI_URL}/scripts/cache-manager.sh" -o "$CACHE_MANAGER" 2>/dev/null || true + chmod +x "$CACHE_MANAGER" 2>/dev/null || true +fi + +# Fix pg_tblspc symlinks to point to the correct tablespace location +# PostgreSQL stores tablespace symlinks with absolute paths, which break when data is copied +# Uses relative symlinks so they work both on the host AND inside Docker containers +fix_pg_tblspc_symlinks() { + local datadir="$1" + local pg_tblspc="${datadir}/haf_db_store/pgdata/pg_tblspc" + local tablespace="${datadir}/haf_db_store/tablespace" + + if [[ ! 
-d "$pg_tblspc" ]]; then + return 0 + fi + + for link in "$pg_tblspc"/*; do + if [[ -L "$link" ]]; then + local target + target=$(readlink "$link") + # Fix if symlink contains 'tablespace' (relative or wrong absolute path) + # Use relative path so it works inside Docker containers too + if [[ "$target" == *"tablespace"* ]] && [[ -d "$tablespace" ]]; then + # Relative path from pg_tblspc/16396 to tablespace is ../../tablespace + local relative_path="../../tablespace" + echo "Fixing pg_tblspc symlink: $(basename "$link") -> $relative_path" + sudo rm -f "$link" + sudo ln -s "$relative_path" "$link" + fi + fi + done +} + +# Function to extract NFS cache if local DATA_SOURCE doesn't exist +# Delegates to cache-manager.sh for unified cache handling +# Derives cache type and key from DATA_SOURCE path pattern: /cache/{type}_{key} +extract_nfs_cache_if_needed() { + local data_source="$1" + + # Skip if already exists + if [[ -d "${data_source}/datadir" ]]; then + echo "Local cache exists at ${data_source}/datadir" + return 0 + fi + + # Parse DATA_SOURCE to derive cache type and key + # Pattern: /cache/{type}_{key} -> cache-manager get {type} {key} {data_source} + local basename + basename=$(basename "$data_source") + + # Split by first underscore: haf_pipeline_12345_filtered -> type=haf_pipeline, key=12345_filtered + # Most common: hive_{commit} -> type=hive, key={commit} + local cache_type cache_key + if [[ "$basename" =~ ^([^_]+_[^_]+)_(.+)$ ]]; then + cache_type="${BASH_REMATCH[1]}" + cache_key="${BASH_REMATCH[2]}" + elif [[ "$basename" =~ ^([^_]+)_(.+)$ ]]; then + cache_type="${BASH_REMATCH[1]}" + cache_key="${BASH_REMATCH[2]}" + else + echo "Cannot parse DATA_SOURCE path for NFS fallback: $data_source" + return 1 + fi + + echo "Attempting NFS cache retrieval: type=$cache_type key=$cache_key" + + # Use cache-manager if available, otherwise fall back to direct NFS access + if [[ -x "$CACHE_MANAGER" ]]; then + echo "Using cache-manager for NFS fallback" + if "$CACHE_MANAGER" get "$cache_type" "$cache_key" "$data_source"; then + echo "Cache-manager retrieved cache successfully" + # Fix pg_tblspc symlinks after extraction (cache-manager handles pgdata perms) + fix_pg_tblspc_symlinks "${data_source}/datadir" + return 0 + else + echo "Cache-manager could not retrieve cache" + return 1 + fi + else + # Fallback: direct NFS tar extraction (for environments without cache-manager) + local nfs_tar="${CACHE_NFS_PATH}/${cache_type}/${cache_key}.tar" + echo "Cache-manager not found, checking NFS directly: $nfs_tar" + + if [[ -f "$nfs_tar" ]]; then + echo "Found NFS cache, extracting to $data_source" + mkdir -p "$data_source" + chmod 777 "$data_source" 2>/dev/null || true + + if tar xf "$nfs_tar" -C "$data_source"; then + echo "NFS cache extracted successfully" + + # Restore pgdata permissions for PostgreSQL + local pgdata="${data_source}/datadir/haf_db_store/pgdata" + local tablespace="${data_source}/datadir/haf_db_store/tablespace" + if [[ -d "$pgdata" ]]; then + chmod 700 "$pgdata" 2>/dev/null || true + chown -R 105:105 "$pgdata" 2>/dev/null || true + fi + if [[ -d "$tablespace" ]]; then + chmod 700 "$tablespace" 2>/dev/null || true + chown -R 105:105 "$tablespace" 2>/dev/null || true + fi + # Fix pg_tblspc symlinks after extraction + fix_pg_tblspc_symlinks "${data_source}/datadir" + return 0 + else + echo "ERROR: Failed to extract NFS cache" + return 1 + fi + else + echo "NFS cache not found at $nfs_tar" + return 1 + fi + fi +} + +if [ -n "${DATA_SOURCE+x}" ] +then + echo "DATA_SOURCE: ${DATA_SOURCE}" + 
echo "DATADIR: ${DATADIR}" + + # Try NFS fallback if local DATA_SOURCE doesn't exist + if [[ ! -d "${DATA_SOURCE}/datadir" ]]; then + echo "Local DATA_SOURCE not found, attempting NFS fallback..." + extract_nfs_cache_if_needed "${DATA_SOURCE}" || true + fi + + if [ "$(realpath "${DATA_SOURCE}/datadir")" != "$(realpath "${DATADIR}")" ] + then + echo "Creating copy of ${DATA_SOURCE}/datadir inside ${DATADIR}" + sudo -Enu hived mkdir -p "${DATADIR}" + # Use cp without -p to avoid "Operation not supported" errors when copying from NFS + flock "${DATA_SOURCE}/datadir" sudo -En cp -r --no-preserve=mode,ownership "${DATA_SOURCE}/datadir"/* "${DATADIR}" + + # Fix pg_tblspc symlinks after copying to DATADIR + fix_pg_tblspc_symlinks "${DATADIR}" + + # Handle blockchain directory - may be excluded from cache for efficiency + # Check if directory exists AND has block_log files (empty dirs can be created by Docker bind mounts) + if [[ -d "${DATA_SOURCE}/datadir/blockchain" ]] && ls "${DATA_SOURCE}/datadir/blockchain"/block_log* 1>/dev/null 2>&1; then + sudo chmod -R a+w "${DATA_SOURCE}/datadir/blockchain" + ls -al "${DATA_SOURCE}/datadir/blockchain" + elif [[ -d "${SHARED_BLOCK_LOG_DIR}" ]]; then + # Blockchain not in cache or empty - create symlinks to shared block_log + # Remove empty blockchain dir if it exists (leftover from Docker bind mounts) + if [[ -d "${DATADIR}/blockchain" ]] && [[ -z "$(ls -A "${DATADIR}/blockchain" 2>/dev/null)" ]]; then + rmdir "${DATADIR}/blockchain" 2>/dev/null || true + fi + echo "Blockchain not in cache, linking to shared block_log at ${SHARED_BLOCK_LOG_DIR}" + sudo -Enu hived mkdir -p "${DATADIR}/blockchain" + for block_file in "${SHARED_BLOCK_LOG_DIR}"/block_log* ; do + if [[ -f "$block_file" ]]; then + local_name=$(basename "$block_file") + sudo -Enu hived ln -sf "$block_file" "${DATADIR}/blockchain/${local_name}" + echo "Linked: ${local_name}" + fi + done + ls -al "${DATADIR}/blockchain" + else + echo "WARNING: No blockchain in cache and shared block_log not found at ${SHARED_BLOCK_LOG_DIR}" + fi + + if [[ -e "${DATA_SOURCE}/shm_dir" && "$(realpath "${DATA_SOURCE}/shm_dir")" != "$(realpath "${SHM_DIR}")" ]] + then + echo "Creating copy of ${DATA_SOURCE}/shm_dir inside ${SHM_DIR}" + sudo -Enu hived mkdir -p "${SHM_DIR}" + # Use cp without -p to avoid "Operation not supported" errors when copying from NFS + flock "${DATA_SOURCE}/datadir" sudo -En cp -r --no-preserve=mode,ownership "${DATA_SOURCE}/shm_dir"/* "${SHM_DIR}" + sudo chmod -R a+w "${SHM_DIR}" + ls -al "${SHM_DIR}" + else + echo "Skipping shm_dir processing." + fi + ls -al "${DATA_SOURCE}/datadir" + fi +fi diff --git a/haf-app-tools/scripts/create_haf_app_role.sh b/haf-app-tools/scripts/create_haf_app_role.sh new file mode 100644 index 0000000..6f33ee1 --- /dev/null +++ b/haf-app-tools/scripts/create_haf_app_role.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# +# Creates a HAF application role on a PostgreSQL cluster +# Fetched from: common-ci-configuration/haf-app-tools/scripts/create_haf_app_role.sh +# + +SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" + +LOG_FILE=setup_postgres.log + +# Source common.sh - try local first, then fetch from common-ci-configuration +if [[ -f "$SCRIPTPATH/common.sh" ]]; then + source "$SCRIPTPATH/common.sh" +else + COMMON_CI_URL="${COMMON_CI_URL:-https://gitlab.syncad.com/hive/common-ci-configuration/-/raw/develop}" + COMMON_SH="/tmp/haf-app-tools-common.sh" + if [[ ! 
-f "$COMMON_SH" ]]; then + curl -fsSL "${COMMON_CI_URL}/haf-app-tools/scripts/common.sh" -o "$COMMON_SH" + fi + source "$COMMON_SH" +fi + +log_exec_params "$@" + +print_help () { + echo "Usage: $0 [OPTION[=VALUE]]..." + echo + echo "Creates a HAF app role on a PostgreSQL cluster." + echo "OPTIONS:" + echo " --host=VALUE Specify postgreSQL host location (defaults to /var/run/postgresql)." + echo " --port=NUMBER Specify a postgreSQL operating port (defaults to 5432)." + echo " --postgres-url=URL Specify postgreSQL connection url directly." + echo " --haf-app-account=NAME Specify an account name to be added to the base group." + echo " --base-group=GROUP Specify the base group (defaults to hive_applications_owner_group)." + echo " --public Enable query_supervisor limiting for the haf_app_account." + echo " --help Display this help screen and exit." + echo +} + +create_haf_app_account() { + local pg_access="$1" + local haf_app_account="$2" + local is_public="$3" + + local base_group="$BASE_GROUP" + local alter_to_public="" + $is_public && alter_to_public="ALTER ROLE ${haf_app_account} SET query_supervisor.limits_enabled TO true;" + + psql -aw "$pg_access" -v ON_ERROR_STOP=on -f - < Date: Sun, 4 Jan 2026 03:39:49 -0500 Subject: [PATCH 2/2] Add runtime fetch of run_hived_img.sh for HAF apps without hive submodule When run_hived_img.sh is not found in common locations, fetch it from the hive repository at runtime along with its dependency common.sh. This allows HAF apps that have dropped the haf submodule to still perform replay when a cache miss occurs. --- scripts/build_data.sh | 18 ++++++++++++++++-- templates/haf_data_preparation.gitlab-ci.yml | 18 ++++++++++++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/scripts/build_data.sh b/scripts/build_data.sh index 0905ed5..cc2849c 100755 --- a/scripts/build_data.sh +++ b/scripts/build_data.sh @@ -134,8 +134,22 @@ if [[ -z "$RUN_SCRIPT" ]]; then done if [[ -z "$RUN_SCRIPT" ]]; then - echo "ERROR: Could not find run_hived_img.sh. Specify --run-script=PATH" - exit 1 + # Fallback: fetch run_hived_img.sh and common.sh from hive repo + echo "run_hived_img.sh not found locally, fetching from hive repo..." + HIVE_SCRIPTS_REF="${HIVE_SCRIPTS_REF:-develop}" + HIVE_RAW_URL="https://gitlab.syncad.com/hive/hive/-/raw/${HIVE_SCRIPTS_REF}/scripts" + HIVE_SCRIPTS_DIR="/tmp/hive-scripts" + mkdir -p "$HIVE_SCRIPTS_DIR" + + # Fetch run_hived_img.sh and its dependency common.sh + for script in run_hived_img.sh common.sh; do + echo "Fetching $script from hive@${HIVE_SCRIPTS_REF}..." + curl -fsSL "${HIVE_RAW_URL}/${script}" -o "$HIVE_SCRIPTS_DIR/$script" + chmod +x "$HIVE_SCRIPTS_DIR/$script" + done + + RUN_SCRIPT="$HIVE_SCRIPTS_DIR/run_hived_img.sh" + echo "Using fetched run_hived_img.sh from: $RUN_SCRIPT" fi fi diff --git a/templates/haf_data_preparation.gitlab-ci.yml b/templates/haf_data_preparation.gitlab-ci.yml index f1ac240..33f0b2d 100644 --- a/templates/haf_data_preparation.gitlab-ci.yml +++ b/templates/haf_data_preparation.gitlab-ci.yml @@ -192,8 +192,22 @@ done if [[ -z "$RUN_SCRIPT" ]]; then - echo "ERROR: Could not find run_hived_img.sh" - exit 1 + # Fallback: fetch run_hived_img.sh and common.sh from hive repo + echo "run_hived_img.sh not found locally, fetching from hive repo..." 
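+      # HIVE_SCRIPTS_REF can be set in the app's CI variables to pin a specific hive ref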
+ HIVE_SCRIPTS_REF="${HIVE_SCRIPTS_REF:-develop}" + HIVE_RAW_URL="https://gitlab.syncad.com/hive/hive/-/raw/${HIVE_SCRIPTS_REF}/scripts" + HIVE_SCRIPTS_DIR="/tmp/hive-scripts" + mkdir -p "$HIVE_SCRIPTS_DIR" + + # Fetch run_hived_img.sh and its dependency common.sh + for script in run_hived_img.sh common.sh; do + echo "Fetching $script from hive@${HIVE_SCRIPTS_REF}..." + curl -fsSL "${HIVE_RAW_URL}/${script}" -o "$HIVE_SCRIPTS_DIR/$script" + chmod +x "$HIVE_SCRIPTS_DIR/$script" + done + + RUN_SCRIPT="$HIVE_SCRIPTS_DIR/run_hived_img.sh" + echo "Using fetched run_hived_img.sh from: $RUN_SCRIPT" fi # Run replay using build_data.sh from common-ci-configuration -- GitLab