From db7e66f47ebad157599d05e07c2371b0b4b110f8 Mon Sep 17 00:00:00 2001
From: Marcin Ickiewicz <mickiewicz@syncad.com>
Date: Thu, 12 Nov 2020 19:29:06 +0100
Subject: [PATCH] Use GIN index to find posts by tag

-remove hive_post_tags table
-extend hive_posts with a column holding an array of tag ids
-tags are added right after a post, not in parallel as it was previously
-fix bug with not removing tags from a post whose author removed them
during editing
---
 hive/db/db_state.py                           |  3 +-
 hive/db/schema.py                             | 27 +++----
 .../bridge_get_ranked_post_for_tag.sql        | 44 +++++------
 hive/db/sql_scripts/hive_post_operations.sql  | 26 +++++++
 .../upgrade/upgrade_table_schema.sql          | 19 +++++
 hive/indexer/blocks.py                        |  4 -
 hive/indexer/posts.py                         |  5 +-
 hive/indexer/tags.py                          | 74 -------------------
 tests/tests_api                               |  2 +-
 9 files changed, 78 insertions(+), 126 deletions(-)
 delete mode 100644 hive/indexer/tags.py

diff --git a/hive/db/db_state.py b/hive/db/db_state.py
index acdfe4136..76ba73754 100644
--- a/hive/db/db_state.py
+++ b/hive/db/db_state.py
@@ -119,6 +119,7 @@ class DbState:
             'hive_posts_updated_at_idx',
             'hive_posts_payout_plus_pending_payout_id_idx',
             'hive_posts_category_id_payout_plus_pending_payout_depth_idx',
+            'hive_posts_tags_ids_idx',
 
             'hive_posts_api_helper_author_s_permlink_idx',
 
@@ -130,8 +131,6 @@ class DbState:
             'hive_communities_block_num_idx',
             'hive_reblogs_created_at_idx',
 
-            'hive_post_tags_tag_id_idx',
-
             'hive_votes_voter_id_post_id_idx',
             'hive_votes_post_id_voter_id_idx',
 
diff --git a/hive/db/schema.py b/hive/db/schema.py
index 3d3140ee1..0817f6677 100644
--- a/hive/db/schema.py
+++ b/hive/db/schema.py
@@ -128,6 +128,7 @@ def build_metadata():
         sa.Column('beneficiaries', sa.JSON, nullable=False, server_default='[]'),
         sa.Column('block_num', sa.Integer,  nullable=False ),
         sa.Column('block_num_created', sa.Integer,  nullable=False ),
+        sa.Column('tags_ids', sa.ARRAY(sa.Integer),  nullable=True ),
 
         sa.ForeignKeyConstraint(['author_id'], ['hive_accounts.id'], name='hive_posts_fk1'),
         sa.ForeignKeyConstraint(['root_id'], ['hive_posts.id'], name='hive_posts_fk2'),
@@ -152,8 +153,9 @@ def build_metadata():
         sa.Index('hive_posts_cashout_time_id_idx', 'cashout_time', 'id'),
         sa.Index('hive_posts_updated_at_idx', sa.text('updated_at DESC')),
         sa.Index('hive_posts_payout_plus_pending_payout_id_idx', sa.text('(payout+pending_payout), id, is_paidout'), postgresql_where=sql_text("counter_deleted = 0 AND NOT is_paidout")),
-        sa.Index('hive_posts_category_id_payout_plus_pending_payout_depth_idx', sa.text('category_id, (payout+pending_payout), depth'), postgresql_where=sql_text("NOT is_paidout AND counter_deleted = 0"))
-    )
+        sa.Index('hive_posts_category_id_payout_plus_pending_payout_depth_idx', sa.text('category_id, (payout+pending_payout), depth'), postgresql_where=sql_text("NOT is_paidout AND counter_deleted = 0")),
+        sa.Index('hive_posts_tags_ids_idx', 'tags_ids', postgresql_using="gin", postgresql_ops={'tags_ids': 'gin__int_ops'})
+        )
 
     sa.Table(
         'hive_post_data', metadata,
@@ -215,18 +217,6 @@ def build_metadata():
         sa.UniqueConstraint('tag', name='hive_tag_data_ux1')
     )
 
-    sa.Table(
-        'hive_post_tags', metadata,
-        sa.Column('post_id', sa.Integer, nullable=False),
-        sa.Column('tag_id', sa.Integer, nullable=False),
-        sa.PrimaryKeyConstraint('post_id', 'tag_id', name='hive_post_tags_pk1'),
-
-        sa.ForeignKeyConstraint(['post_id'], ['hive_posts.id'], name='hive_post_tags_fk1'),
-        sa.ForeignKeyConstraint(['tag_id'], ['hive_tag_data.id'], name='hive_post_tags_fk2'),
-
-        sa.Index('hive_post_tags_tag_id_idx', 'tag_id')
-    )
-
     sa.Table(
         'hive_follows', metadata,
         sa.Column('id', sa.Integer, primary_key=True ),
@@ -457,6 +447,9 @@ def create_fk(db):
 
 def setup(db):
     """Creates all tables and seed data"""
+
+    sql = """SELECT * FROM pg_extension WHERE extname='intarray'"""
+    assert db.query_row( sql ), "The database requires created 'intarray' extension"
     # initialize schema
     build_metadata().create_all(db.engine())
 
@@ -617,8 +610,8 @@ def setup(db):
     dir_path = dirname(realpath(__file__))
     for script in sql_scripts:
         execute_sql_script(db.query_no_return, "{}/sql_scripts/{}".format(dir_path, script))
-    
-    
+
+
 
 
 
@@ -631,7 +624,6 @@ def reset_autovac(db):
     autovac_config = { #    vacuum  analyze
         'hive_accounts':    (50000, 100000),
         'hive_posts':       (2500, 10000),
-        'hive_post_tags':   (5000, 10000),
         'hive_follows':     (5000, 5000),
         'hive_feed_cache':  (5000, 5000),
         'hive_blocks':      (5000, 25000),
@@ -667,7 +659,6 @@ def set_logged_table_attribute(db, logged):
     logged_config = [
         'hive_accounts',
         'hive_permlink_data',
-        'hive_post_tags',
         'hive_posts',
         'hive_post_data',
         'hive_votes',
diff --git a/hive/db/sql_scripts/bridge_get_ranked_post_for_tag.sql b/hive/db/sql_scripts/bridge_get_ranked_post_for_tag.sql
index 3b1851cbf..adbacf9c4 100644
--- a/hive/db/sql_scripts/bridge_get_ranked_post_for_tag.sql
+++ b/hive/db/sql_scripts/bridge_get_ranked_post_for_tag.sql
@@ -5,11 +5,11 @@ AS
 $function$
 DECLARE
   __post_id INT;
-  __hive_tag INT;
+  __hive_tag INT[];
   __observer_id INT;
 BEGIN
   __post_id = find_comment_id( _author, _permlink, True );
-  __hive_tag = find_tag_id( _tag, True );
+  __hive_tag = ARRAY_APPEND( __hive_tag, find_tag_id( _tag, True ));
   __observer_id = find_account_id(_observer, False);
   RETURN QUERY SELECT
       hp.id,
@@ -54,10 +54,9 @@ BEGIN
       SELECT
           hp1.id
       FROM
-          hive_post_tags hpt
-          JOIN hive_posts hp1 ON hp1.id = hpt.post_id
+          hive_posts hp1
           JOIN hive_accounts_view ha ON hp1.author_id = ha.id
-      WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND hp1.depth = 0 AND NOT ha.is_grayed AND ( __post_id = 0 OR hp1.id < __post_id )
+      WHERE hp1.tags_ids @> __hive_tag AND hp1.counter_deleted = 0 AND hp1.depth = 0 AND NOT ha.is_grayed AND ( __post_id = 0 OR hp1.id < __post_id )
       --ORDER BY hp1.id + 0 DESC -- this workaround helped the query to better choose indexes, but after some time it started to significally slow down
       AND (NOT EXISTS (SELECT 1 FROM muted_accounts_by_id_view WHERE observer_id = __observer_id AND muted_id = hp1.author_id))
       ORDER BY hp1.id DESC
@@ -78,14 +77,14 @@ $function$
 DECLARE
   __post_id INT;
   __hot_limit FLOAT;
-  __hive_tag INT;
+  __hive_tag INT[];
   __observer_id INT;
 BEGIN
   __post_id = find_comment_id( _author, _permlink, True );
   IF __post_id <> 0 THEN
       SELECT hp.sc_hot INTO __hot_limit FROM hive_posts hp WHERE hp.id = __post_id;
   END IF;
-  __hive_tag = find_tag_id( _tag, True );
+  __hive_tag = ARRAY_APPEND( __hive_tag, find_tag_id( _tag, True ));
   __observer_id = find_account_id(_observer, False);
   RETURN QUERY SELECT
       hp.id,
@@ -131,9 +130,8 @@ BEGIN
           hp1.id
         , hp1.sc_hot as hot
       FROM
-          hive_post_tags hpt
-          JOIN hive_posts hp1 ON hp1.id = hpt.post_id
-      WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.depth = 0
+          hive_posts hp1
+      WHERE hp1.tags_ids @> __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.depth = 0
           AND ( __post_id = 0 OR hp1.sc_hot < __hot_limit OR ( hp1.sc_hot = __hot_limit AND hp1.id < __post_id ) )
           AND (NOT EXISTS (SELECT 1 FROM muted_accounts_by_id_view WHERE observer_id = __observer_id AND muted_id = hp1.author_id))
       ORDER BY hp1.sc_hot DESC, hp1.id DESC
@@ -154,13 +152,13 @@ $function$
 DECLARE
   __post_id INT;
   __payout_limit hive_posts.payout%TYPE;
-  __hive_tag INT;
+  __hive_tag INT[];
 BEGIN
   __post_id = find_comment_id( _author, _permlink, True );
   IF __post_id <> 0 THEN
       SELECT ( hp.payout + hp.pending_payout ) INTO __payout_limit FROM hive_posts hp WHERE hp.id = __post_id;
   END IF;
-  __hive_tag = find_tag_id( _tag, True );
+  __hive_tag = ARRAY_APPEND( __hive_tag, find_tag_id( _tag, True ) );
   RETURN QUERY SELECT
       hp.id,
       hp.author,
@@ -206,9 +204,8 @@ BEGIN
         , ( hp1.payout + hp1.pending_payout ) as all_payout
       FROM
           hive_posts hp1
-          JOIN hive_post_tags hpt ON hp1.id = hpt.post_id
           JOIN hive_accounts_view ha ON hp1.author_id = ha.id
-      WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND ha.is_grayed AND ( hp1.payout + hp1.pending_payout ) > 0
+      WHERE hp1.tags_ids @> __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND ha.is_grayed AND ( hp1.payout + hp1.pending_payout ) > 0
           AND ( __post_id = 0 OR ( hp1.payout + hp1.pending_payout ) < __payout_limit OR ( ( hp1.payout + hp1.pending_payout ) = __payout_limit AND hp1.id < __post_id ) )
       ORDER BY ( hp1.payout + hp1.pending_payout ) DESC, hp1.id DESC
       LIMIT _limit
@@ -381,14 +378,14 @@ $function$
 DECLARE
   __post_id INT;
   __promoted_limit hive_posts.promoted%TYPE;
-  __hive_tag INT;
+  __hive_tag INT[];
   __observer_id INT;
 BEGIN
   __post_id = find_comment_id( _author, _permlink, True );
   IF __post_id <> 0 THEN
       SELECT hp.promoted INTO __promoted_limit FROM hive_posts hp WHERE hp.id = __post_id;
   END IF;
-  __hive_tag = find_tag_id( _tag, True );
+  __hive_tag = ARRAY_APPEND( __hive_tag,  find_tag_id( _tag, True ) );
   __observer_id = find_account_id(_observer, False);
   RETURN QUERY SELECT
       hp.id,
@@ -434,9 +431,8 @@ BEGIN
           hp1.id
         , hp1.promoted as promoted
       FROM
-          hive_post_tags hpt
-          JOIN hive_posts hp1 ON hp1.id = hpt.post_id
-      WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.promoted > 0
+          hive_posts hp1
+      WHERE hp1.tags_ids @> __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.promoted > 0
           AND ( __post_id = 0 OR hp1.promoted < __promoted_limit OR ( hp1.promoted = __promoted_limit AND hp1.id < __post_id ) )
           AND (NOT EXISTS (SELECT 1 FROM muted_accounts_by_id_view WHERE observer_id = __observer_id AND muted_id = hp1.author_id))
       ORDER BY hp1.promoted DESC, hp1.id DESC
@@ -457,14 +453,14 @@ $function$
 DECLARE
   __post_id INT;
   __trending_limit FLOAT;
-  __hive_tag INT;
+  __hive_tag INT[];
   __observer_id INT;
 BEGIN
   __post_id = find_comment_id( _author, _permlink, True );
   IF __post_id <> 0 THEN
       SELECT hp.sc_trend INTO __trending_limit FROM hive_posts hp WHERE hp.id = __post_id;
   END IF;
-  __hive_tag = find_tag_id( _tag, True );
+  __hive_tag = ARRAY_APPEND( __hive_tag, find_tag_id( _tag, True ));
   __observer_id = find_account_id(_observer, False);
   RETURN QUERY SELECT
       hp.id,
@@ -510,15 +506,15 @@ BEGIN
           hp1.id
         , hp1.sc_trend as trend
       FROM
-          hive_post_tags hpt
-      JOIN hive_posts hp1 ON hp1.id = hpt.post_id
-      WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.depth = 0
+         hive_posts hp1
+      WHERE hp1.tags_ids @> __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.depth = 0
           AND ( __post_id = 0 OR hp1.sc_trend < __trending_limit OR ( hp1.sc_trend = __trending_limit AND hp1.id < __post_id ) )
           AND (NOT EXISTS (SELECT 1 FROM muted_accounts_by_id_view WHERE observer_id = __observer_id AND muted_id = hp1.author_id))
       ORDER BY hp1.sc_trend DESC, hp1.id DESC
       LIMIT _limit
   ) as trends
   JOIN hive_posts_view hp ON hp.id = trends.id
+  WHERE (CASE WHEN _observer IS NOT NULL THEN NOT EXISTS (SELECT 1 FROM muted_accounts_view WHERE observer = _observer AND muted = hp.author) ELSE true END)
   ORDER BY trends.trend DESC, trends.id DESC
   LIMIT _limit;
 END
diff --git a/hive/db/sql_scripts/hive_post_operations.sql b/hive/db/sql_scripts/hive_post_operations.sql
index 2bf71e82f..0654a4dcd 100644
--- a/hive/db/sql_scripts/hive_post_operations.sql
+++ b/hive/db/sql_scripts/hive_post_operations.sql
@@ -138,3 +138,29 @@ BEGIN
 END
 $function$
 ;
+
+DROP FUNCTION IF EXISTS add_tags;
+CREATE FUNCTION add_tags( in _post_id hive_posts.id%TYPE, in _tags VARCHAR[] )
+RETURNS void -- stores the ids of the tag names in _tags in hive_posts.tags_ids of the post _post_id
+LANGUAGE 'plpgsql'
+VOLATILE
+AS
+$function$
+DECLARE
+	__tags_ids INTEGER[]; -- hive_tag_data ids collected for the names passed in _tags
+BEGIN
+	WITH tags_ids(id) AS -- upsert every tag name and gather the id of each resulting row
+	(
+		INSERT INTO
+			hive_tag_data AS htd(tag)
+		SELECT UNNEST( _tags )
+		ON CONFLICT("tag") DO UPDATE SET tag=EXCLUDED.tag --no-op update so that RETURNING also yields ids of tags that already exist
+		RETURNING htd.id
+	)
+	SELECT ARRAY_AGG( id ) INTO __tags_ids FROM tags_ids; -- NOTE(review) presumably NULL when _tags is empty - confirm
+
+	UPDATE hive_posts hp -- overwrites the whole array, so tags no longer present in _tags are dropped from the post
+	SET tags_ids = __tags_ids
+	WHERE hp.id = _post_id;
+END
+$function$
diff --git a/hive/db/sql_scripts/upgrade/upgrade_table_schema.sql b/hive/db/sql_scripts/upgrade/upgrade_table_schema.sql
index e0e6d418c..fe90715f6 100644
--- a/hive/db/sql_scripts/upgrade/upgrade_table_schema.sql
+++ b/hive/db/sql_scripts/upgrade/upgrade_table_schema.sql
@@ -1,3 +1,8 @@
+do $$
+BEGIN -- stop the upgrade right away unless the intarray extension is installed (hive_posts_tags_ids_idx below uses its gin__int_ops operator class)
+   ASSERT EXISTS (SELECT * FROM pg_extension WHERE extname='intarray'), 'The database requires created "intarray" extension';
+END$$;
+
 CREATE TABLE IF NOT EXISTS hive_db_patch_level
 (
   level SERIAL NOT NULL PRIMARY KEY,
@@ -215,6 +220,15 @@ IF NOT EXISTS (SELECT data_type FROM information_schema.columns
 ELSE
   RAISE NOTICE 'SKIPPING hive_posts upgrade - adding total_votes and net_votes columns';
 END IF;
+
+IF NOT EXISTS(SELECT data_type FROM information_schema.columns
+          WHERE table_name = 'hive_posts' AND column_name = 'tags_ids') THEN
+    ALTER TABLE ONLY hive_posts
+            ADD COLUMN tags_ids INTEGER[];
+ELSE
+    RAISE NOTICE 'SKIPPING hive_posts upgrade - adding a tags_ids column';
+END IF;
+
 END
 
 $BODY$
@@ -390,3 +404,8 @@ DROP INDEX IF EXISTS hive_posts_promoted_idx;
 CREATE INDEX IF NOT EXISTS hive_posts_promoted_id_idx ON hive_posts (promoted, id)
   WHERE NOT is_paidout AND counter_deleted = 0
  ;
+
+
+ CREATE INDEX IF NOT EXISTS hive_posts_tags_ids_idx ON hive_posts USING gin(tags_ids gin__int_ops);
+
+ DROP TABLE IF EXISTS hive_post_tags;
diff --git a/hive/indexer/blocks.py b/hive/indexer/blocks.py
index 8b5783c2d..2de572b06 100644
--- a/hive/indexer/blocks.py
+++ b/hive/indexer/blocks.py
@@ -14,7 +14,6 @@ from hive.indexer.payments import Payments
 from hive.indexer.follow import Follow
 from hive.indexer.votes import Votes
 from hive.indexer.post_data_cache import PostDataCache
-from hive.indexer.tags import Tags
 from hive.indexer.reputations import Reputations
 from hive.indexer.reblog import Reblog
 from hive.indexer.notify import Notify
@@ -49,7 +48,6 @@ class Blocks:
       ('PostDataCache', PostDataCache.flush, PostDataCache),
       ('Reputations', Reputations.flush, Reputations),
       ('Votes', Votes.flush, Votes),
-      ('Tags', Tags.flush, Tags),
       ('Follow', Follow.flush, Follow),
       ('Reblog', Reblog.flush, Reblog),
       ('Notify', Notify.flush, Notify),
@@ -70,7 +68,6 @@ class Blocks:
         PostDataCache.setup_own_db_access(sharedDbAdapter)
         Reputations.setup_own_db_access(sharedDbAdapter)
         Votes.setup_own_db_access(sharedDbAdapter)
-        Tags.setup_own_db_access(sharedDbAdapter)
         Follow.setup_own_db_access(sharedDbAdapter)
         Posts.setup_own_db_access(sharedDbAdapter)
         Reblog.setup_own_db_access(sharedDbAdapter)
@@ -413,7 +410,6 @@ class Blocks:
 
             # remove posts: core, tags, cache entries
             if post_ids:
-                DB.query("DELETE FROM hive_post_tags   WHERE post_id IN :ids", ids=post_ids)
                 DB.query("DELETE FROM hive_posts       WHERE id      IN :ids", ids=post_ids)
                 DB.query("DELETE FROM hive_post_data   WHERE id      IN :ids", ids=post_ids)
 
diff --git a/hive/indexer/posts.py b/hive/indexer/posts.py
index 08edcbb58..c36811488 100644
--- a/hive/indexer/posts.py
+++ b/hive/indexer/posts.py
@@ -14,7 +14,6 @@ from hive.indexer.feed_cache import FeedCache
 from hive.indexer.community import Community
 from hive.indexer.notify import Notify
 from hive.indexer.post_data_cache import PostDataCache
-from hive.indexer.tags import Tags
 from hive.indexer.db_adapter_holder import DbAdapterHolder
 from hive.utils.misc import chunks
 
@@ -152,8 +151,8 @@ class Posts(DbAdapterHolder):
             from funcy.seqs import distinct
             tags = list(distinct(tags))[:5]
 
-            for tag in tags:
-                Tags.add_tag(result['id'], tag)
+            sql = """SELECT add_tags( (:post_id)::INTEGER, (:tags)::VARCHAR[] )"""
+            DB.query_row( sql, post_id = result['id'], tags=tags );
 
         if not DbState.is_initial_sync():
             if error:
diff --git a/hive/indexer/tags.py b/hive/indexer/tags.py
deleted file mode 100644
index b3b25e82e..000000000
--- a/hive/indexer/tags.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import logging
-from hive.indexer.db_adapter_holder import DbAdapterHolder
-
-log = logging.getLogger(__name__)
-
-from hive.utils.normalize import escape_characters
-
-class Tags(DbAdapterHolder):
-    """ Tags cache """
-    _tags = []
-
-    @classmethod
-    def add_tag(cls, tid, tag):
-        """ Add tag to cache """
-        cls._tags.append((tid, tag))
-
-    @classmethod
-    def flush(cls):
-        """ Flush tags to table """        
-        if cls._tags:
-            cls.beginTx()
-            limit = 1000
-
-            sql = """
-                INSERT INTO
-                    hive_tag_data (tag)
-                VALUES {} 
-                ON CONFLICT DO NOTHING
-            """
-            values = []
-            for tag in cls._tags:
-                values.append("({})".format(escape_characters(tag[1])))
-                if len(values) >= limit:
-                    tag_query = str(sql)
-                    cls.db.query(tag_query.format(','.join(values)))
-                    values.clear()
-            if len(values) > 0:
-                tag_query = str(sql)
-                cls.db.query(tag_query.format(','.join(values)))
-                values.clear()
-
-            sql = """
-                INSERT INTO
-                    hive_post_tags (post_id, tag_id)
-                SELECT 
-                    data_source.post_id, data_source.tag_id
-                FROM
-                (
-                    SELECT 
-                        post_id, htd.id
-                    FROM
-                    (
-                        VALUES 
-                            {}
-                    ) AS T(post_id, tag)
-                    INNER JOIN hive_tag_data htd ON htd.tag = T.tag
-                ) AS data_source(post_id, tag_id)
-                ON CONFLICT DO NOTHING
-            """
-            values = []
-            for tag in cls._tags:
-                values.append("({}, {})".format(tag[0], escape_characters(tag[1])))
-                if len(values) >= limit:
-                    tag_query = str(sql)
-                    cls.db.query(tag_query.format(','.join(values)))
-                    values.clear()
-            if len(values) > 0:
-                tag_query = str(sql)
-                cls.db.query(tag_query.format(','.join(values)))
-                values.clear()
-            cls.commitTx()
-        n = len(cls._tags)
-        cls._tags.clear()
-        return n
diff --git a/tests/tests_api b/tests/tests_api
index 0f0fd1af6..1ffd591d3 160000
--- a/tests/tests_api
+++ b/tests/tests_api
@@ -1 +1 @@
-Subproject commit 0f0fd1af6d7e367849a87443c0137702b135e297
+Subproject commit 1ffd591d38c5e764e8a3910af2d5548d8b28a55b
-- 
GitLab