From db7e66f47ebad157599d05e07c2371b0b4b110f8 Mon Sep 17 00:00:00 2001 From: Marcin Ickiewicz <mickiewicz@syncad.com> Date: Thu, 12 Nov 2020 19:29:06 +0100 Subject: [PATCH] Use a GIN index to find posts by tag -remove hive_post_tags table -extend hive_posts with a column holding an array of tag ids -tags are added right after a post, not in parallel as it was previously -fix bug with not removing tags for a post whose author removed tags during editing --- hive/db/db_state.py | 3 +- hive/db/schema.py | 27 +++---- .../bridge_get_ranked_post_for_tag.sql | 44 +++++------ hive/db/sql_scripts/hive_post_operations.sql | 26 +++++++ .../upgrade/upgrade_table_schema.sql | 19 +++++ hive/indexer/blocks.py | 4 - hive/indexer/posts.py | 5 +- hive/indexer/tags.py | 74 ------------------- tests/tests_api | 2 +- 9 files changed, 78 insertions(+), 126 deletions(-) delete mode 100644 hive/indexer/tags.py diff --git a/hive/db/db_state.py b/hive/db/db_state.py index acdfe4136..76ba73754 100644 --- a/hive/db/db_state.py +++ b/hive/db/db_state.py @@ -119,6 +119,7 @@ class DbState: 'hive_posts_updated_at_idx', 'hive_posts_payout_plus_pending_payout_id_idx', 'hive_posts_category_id_payout_plus_pending_payout_depth_idx', + 'hive_posts_tags_ids_idx', 'hive_posts_api_helper_author_s_permlink_idx', @@ -130,8 +131,6 @@ class DbState: 'hive_communities_block_num_idx', 'hive_reblogs_created_at_idx', - 'hive_post_tags_tag_id_idx', - 'hive_votes_voter_id_post_id_idx', 'hive_votes_post_id_voter_id_idx', diff --git a/hive/db/schema.py b/hive/db/schema.py index 3d3140ee1..0817f6677 100644 --- a/hive/db/schema.py +++ b/hive/db/schema.py @@ -128,6 +128,7 @@ def build_metadata(): sa.Column('beneficiaries', sa.JSON, nullable=False, server_default='[]'), sa.Column('block_num', sa.Integer, nullable=False ), sa.Column('block_num_created', sa.Integer, nullable=False ), + sa.Column('tags_ids', sa.ARRAY(sa.Integer), nullable=True ), sa.ForeignKeyConstraint(['author_id'], ['hive_accounts.id'], 
name='hive_posts_fk1'), sa.ForeignKeyConstraint(['root_id'], ['hive_posts.id'], name='hive_posts_fk2'), @@ -152,8 +153,9 @@ def build_metadata(): sa.Index('hive_posts_cashout_time_id_idx', 'cashout_time', 'id'), sa.Index('hive_posts_updated_at_idx', sa.text('updated_at DESC')), sa.Index('hive_posts_payout_plus_pending_payout_id_idx', sa.text('(payout+pending_payout), id, is_paidout'), postgresql_where=sql_text("counter_deleted = 0 AND NOT is_paidout")), - sa.Index('hive_posts_category_id_payout_plus_pending_payout_depth_idx', sa.text('category_id, (payout+pending_payout), depth'), postgresql_where=sql_text("NOT is_paidout AND counter_deleted = 0")) - ) + sa.Index('hive_posts_category_id_payout_plus_pending_payout_depth_idx', sa.text('category_id, (payout+pending_payout), depth'), postgresql_where=sql_text("NOT is_paidout AND counter_deleted = 0")), + sa.Index('hive_posts_tags_ids_idx', 'tags_ids', postgresql_using="gin", postgresql_ops={'tags_ids': 'gin__int_ops'}) + ) sa.Table( 'hive_post_data', metadata, @@ -215,18 +217,6 @@ def build_metadata(): sa.UniqueConstraint('tag', name='hive_tag_data_ux1') ) - sa.Table( - 'hive_post_tags', metadata, - sa.Column('post_id', sa.Integer, nullable=False), - sa.Column('tag_id', sa.Integer, nullable=False), - sa.PrimaryKeyConstraint('post_id', 'tag_id', name='hive_post_tags_pk1'), - - sa.ForeignKeyConstraint(['post_id'], ['hive_posts.id'], name='hive_post_tags_fk1'), - sa.ForeignKeyConstraint(['tag_id'], ['hive_tag_data.id'], name='hive_post_tags_fk2'), - - sa.Index('hive_post_tags_tag_id_idx', 'tag_id') - ) - sa.Table( 'hive_follows', metadata, sa.Column('id', sa.Integer, primary_key=True ), @@ -457,6 +447,9 @@ def create_fk(db): def setup(db): """Creates all tables and seed data""" + + sql = """SELECT * FROM pg_extension WHERE extname='intarray'""" + assert db.query_row( sql ), "The database requires created 'intarray' extension" # initialize schema build_metadata().create_all(db.engine()) @@ -617,8 +610,8 @@ def setup(db): 
dir_path = dirname(realpath(__file__)) for script in sql_scripts: execute_sql_script(db.query_no_return, "{}/sql_scripts/{}".format(dir_path, script)) - - + + @@ -631,7 +624,6 @@ def reset_autovac(db): autovac_config = { # vacuum analyze 'hive_accounts': (50000, 100000), 'hive_posts': (2500, 10000), - 'hive_post_tags': (5000, 10000), 'hive_follows': (5000, 5000), 'hive_feed_cache': (5000, 5000), 'hive_blocks': (5000, 25000), @@ -667,7 +659,6 @@ def set_logged_table_attribute(db, logged): logged_config = [ 'hive_accounts', 'hive_permlink_data', - 'hive_post_tags', 'hive_posts', 'hive_post_data', 'hive_votes', diff --git a/hive/db/sql_scripts/bridge_get_ranked_post_for_tag.sql b/hive/db/sql_scripts/bridge_get_ranked_post_for_tag.sql index 3b1851cbf..adbacf9c4 100644 --- a/hive/db/sql_scripts/bridge_get_ranked_post_for_tag.sql +++ b/hive/db/sql_scripts/bridge_get_ranked_post_for_tag.sql @@ -5,11 +5,11 @@ AS $function$ DECLARE __post_id INT; - __hive_tag INT; + __hive_tag INT[]; __observer_id INT; BEGIN __post_id = find_comment_id( _author, _permlink, True ); - __hive_tag = find_tag_id( _tag, True ); + __hive_tag = ARRAY_APPEND( __hive_tag, find_tag_id( _tag, True )); __observer_id = find_account_id(_observer, False); RETURN QUERY SELECT hp.id, @@ -54,10 +54,9 @@ BEGIN SELECT hp1.id FROM - hive_post_tags hpt - JOIN hive_posts hp1 ON hp1.id = hpt.post_id + hive_posts hp1 JOIN hive_accounts_view ha ON hp1.author_id = ha.id - WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND hp1.depth = 0 AND NOT ha.is_grayed AND ( __post_id = 0 OR hp1.id < __post_id ) + WHERE hp1.tags_ids @> __hive_tag AND hp1.counter_deleted = 0 AND hp1.depth = 0 AND NOT ha.is_grayed AND ( __post_id = 0 OR hp1.id < __post_id ) --ORDER BY hp1.id + 0 DESC -- this workaround helped the query to better choose indexes, but after some time it started to significally slow down AND (NOT EXISTS (SELECT 1 FROM muted_accounts_by_id_view WHERE observer_id = __observer_id AND muted_id = hp1.author_id)) 
ORDER BY hp1.id DESC @@ -78,14 +77,14 @@ $function$ DECLARE __post_id INT; __hot_limit FLOAT; - __hive_tag INT; + __hive_tag INT[]; __observer_id INT; BEGIN __post_id = find_comment_id( _author, _permlink, True ); IF __post_id <> 0 THEN SELECT hp.sc_hot INTO __hot_limit FROM hive_posts hp WHERE hp.id = __post_id; END IF; - __hive_tag = find_tag_id( _tag, True ); + __hive_tag = ARRAY_APPEND( __hive_tag, find_tag_id( _tag, True )); __observer_id = find_account_id(_observer, False); RETURN QUERY SELECT hp.id, @@ -131,9 +130,8 @@ BEGIN hp1.id , hp1.sc_hot as hot FROM - hive_post_tags hpt - JOIN hive_posts hp1 ON hp1.id = hpt.post_id - WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.depth = 0 + hive_posts hp1 + WHERE hp1.tags_ids @> __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.depth = 0 AND ( __post_id = 0 OR hp1.sc_hot < __hot_limit OR ( hp1.sc_hot = __hot_limit AND hp1.id < __post_id ) ) AND (NOT EXISTS (SELECT 1 FROM muted_accounts_by_id_view WHERE observer_id = __observer_id AND muted_id = hp1.author_id)) ORDER BY hp1.sc_hot DESC, hp1.id DESC @@ -154,13 +152,13 @@ $function$ DECLARE __post_id INT; __payout_limit hive_posts.payout%TYPE; - __hive_tag INT; + __hive_tag INT[]; BEGIN __post_id = find_comment_id( _author, _permlink, True ); IF __post_id <> 0 THEN SELECT ( hp.payout + hp.pending_payout ) INTO __payout_limit FROM hive_posts hp WHERE hp.id = __post_id; END IF; - __hive_tag = find_tag_id( _tag, True ); + __hive_tag = ARRAY_APPEND( __hive_tag, find_tag_id( _tag, True ) ); RETURN QUERY SELECT hp.id, hp.author, @@ -206,9 +204,8 @@ BEGIN , ( hp1.payout + hp1.pending_payout ) as all_payout FROM hive_posts hp1 - JOIN hive_post_tags hpt ON hp1.id = hpt.post_id JOIN hive_accounts_view ha ON hp1.author_id = ha.id - WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND ha.is_grayed AND ( hp1.payout + hp1.pending_payout ) > 0 + WHERE hp1.tags_ids @> __hive_tag AND 
hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND ha.is_grayed AND ( hp1.payout + hp1.pending_payout ) > 0 AND ( __post_id = 0 OR ( hp1.payout + hp1.pending_payout ) < __payout_limit OR ( ( hp1.payout + hp1.pending_payout ) = __payout_limit AND hp1.id < __post_id ) ) ORDER BY ( hp1.payout + hp1.pending_payout ) DESC, hp1.id DESC LIMIT _limit @@ -381,14 +378,14 @@ $function$ DECLARE __post_id INT; __promoted_limit hive_posts.promoted%TYPE; - __hive_tag INT; + __hive_tag INT[]; __observer_id INT; BEGIN __post_id = find_comment_id( _author, _permlink, True ); IF __post_id <> 0 THEN SELECT hp.promoted INTO __promoted_limit FROM hive_posts hp WHERE hp.id = __post_id; END IF; - __hive_tag = find_tag_id( _tag, True ); + __hive_tag = ARRAY_APPEND( __hive_tag, find_tag_id( _tag, True ) ); __observer_id = find_account_id(_observer, False); RETURN QUERY SELECT hp.id, @@ -434,9 +431,8 @@ BEGIN hp1.id , hp1.promoted as promoted FROM - hive_post_tags hpt - JOIN hive_posts hp1 ON hp1.id = hpt.post_id - WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.promoted > 0 + hive_posts hp1 + WHERE hp1.tags_ids @> __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.promoted > 0 AND ( __post_id = 0 OR hp1.promoted < __promoted_limit OR ( hp1.promoted = __promoted_limit AND hp1.id < __post_id ) ) AND (NOT EXISTS (SELECT 1 FROM muted_accounts_by_id_view WHERE observer_id = __observer_id AND muted_id = hp1.author_id)) ORDER BY hp1.promoted DESC, hp1.id DESC @@ -457,14 +453,14 @@ $function$ DECLARE __post_id INT; __trending_limit FLOAT; - __hive_tag INT; + __hive_tag INT[]; __observer_id INT; BEGIN __post_id = find_comment_id( _author, _permlink, True ); IF __post_id <> 0 THEN SELECT hp.sc_trend INTO __trending_limit FROM hive_posts hp WHERE hp.id = __post_id; END IF; - __hive_tag = find_tag_id( _tag, True ); + __hive_tag = ARRAY_APPEND( __hive_tag, find_tag_id( _tag, True )); __observer_id = find_account_id(_observer, False); RETURN 
QUERY SELECT hp.id, @@ -510,15 +506,15 @@ BEGIN hp1.id , hp1.sc_trend as trend FROM - hive_post_tags hpt - JOIN hive_posts hp1 ON hp1.id = hpt.post_id - WHERE hpt.tag_id = __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.depth = 0 + hive_posts hp1 + WHERE hp1.tags_ids @> __hive_tag AND hp1.counter_deleted = 0 AND NOT hp1.is_paidout AND hp1.depth = 0 AND ( __post_id = 0 OR hp1.sc_trend < __trending_limit OR ( hp1.sc_trend = __trending_limit AND hp1.id < __post_id ) ) AND (NOT EXISTS (SELECT 1 FROM muted_accounts_by_id_view WHERE observer_id = __observer_id AND muted_id = hp1.author_id)) ORDER BY hp1.sc_trend DESC, hp1.id DESC LIMIT _limit ) as trends JOIN hive_posts_view hp ON hp.id = trends.id + WHERE (CASE WHEN _observer IS NOT NULL THEN NOT EXISTS (SELECT 1 FROM muted_accounts_view WHERE observer = _observer AND muted = hp.author) ELSE true END) ORDER BY trends.trend DESC, trends.id DESC LIMIT _limit; END diff --git a/hive/db/sql_scripts/hive_post_operations.sql b/hive/db/sql_scripts/hive_post_operations.sql index 2bf71e82f..0654a4dcd 100644 --- a/hive/db/sql_scripts/hive_post_operations.sql +++ b/hive/db/sql_scripts/hive_post_operations.sql @@ -138,3 +138,29 @@ BEGIN END $function$ ; + +DROP FUNCTION IF EXISTS add_tags; +CREATE FUNCTION add_tags( in _post_id hive_posts.id%TYPE, in _tags VARCHAR[] ) +RETURNS void +LANGUAGE 'plpgsql' +VOLATILE +AS +$function$ +DECLARE + __tags_ids INTEGER[]; +BEGIN + WITH tags_ids(id) AS + ( + INSERT INTO + hive_tag_data AS htd(tag) + SELECT UNNEST( _tags ) + ON CONFLICT("tag") DO UPDATE SET tag=EXCLUDED.tag --trick to always return id + RETURNING htd.id + ) + SELECT ARRAY_AGG( id ) INTO __tags_ids FROM tags_ids; + + UPDATE hive_posts hp + SET tags_ids = __tags_ids + WHERE hp.id = _post_id; +END +$function$ diff --git a/hive/db/sql_scripts/upgrade/upgrade_table_schema.sql b/hive/db/sql_scripts/upgrade/upgrade_table_schema.sql index e0e6d418c..fe90715f6 100644 --- 
a/hive/db/sql_scripts/upgrade/upgrade_table_schema.sql +++ b/hive/db/sql_scripts/upgrade/upgrade_table_schema.sql @@ -1,3 +1,8 @@ +do $$ +BEGIN + ASSERT EXISTS (SELECT * FROM pg_extension WHERE extname='intarray'), 'The database requires created "intarray" extension'; +END$$; + CREATE TABLE IF NOT EXISTS hive_db_patch_level ( level SERIAL NOT NULL PRIMARY KEY, @@ -215,6 +220,15 @@ IF NOT EXISTS (SELECT data_type FROM information_schema.columns ELSE RAISE NOTICE 'SKIPPING hive_posts upgrade - adding total_votes and net_votes columns'; END IF; + +IF NOT EXISTS(SELECT data_type FROM information_schema.columns + WHERE table_name = 'hive_posts' AND column_name = 'tags_ids') THEN + ALTER TABLE ONLY hive_posts + ADD COLUMN tags_ids INTEGER[]; +ELSE + RAISE NOTICE 'SKIPPING hive_posts upgrade - adding a tags_ids column'; +END IF; + END $BODY$ @@ -390,3 +404,8 @@ DROP INDEX IF EXISTS hive_posts_promoted_idx; CREATE INDEX IF NOT EXISTS hive_posts_promoted_id_idx ON hive_posts (promoted, id) WHERE NOT is_paidout AND counter_deleted = 0 ; + + + CREATE INDEX IF NOT EXISTS hive_posts_tags_ids_idx ON hive_posts USING gin(tags_ids gin__int_ops); + + DROP TABLE IF EXISTS hive_post_tags; diff --git a/hive/indexer/blocks.py b/hive/indexer/blocks.py index 8b5783c2d..2de572b06 100644 --- a/hive/indexer/blocks.py +++ b/hive/indexer/blocks.py @@ -14,7 +14,6 @@ from hive.indexer.payments import Payments from hive.indexer.follow import Follow from hive.indexer.votes import Votes from hive.indexer.post_data_cache import PostDataCache -from hive.indexer.tags import Tags from hive.indexer.reputations import Reputations from hive.indexer.reblog import Reblog from hive.indexer.notify import Notify @@ -49,7 +48,6 @@ class Blocks: ('PostDataCache', PostDataCache.flush, PostDataCache), ('Reputations', Reputations.flush, Reputations), ('Votes', Votes.flush, Votes), - ('Tags', Tags.flush, Tags), ('Follow', Follow.flush, Follow), ('Reblog', Reblog.flush, Reblog), ('Notify', Notify.flush, Notify), @@ 
-70,7 +68,6 @@ class Blocks: PostDataCache.setup_own_db_access(sharedDbAdapter) Reputations.setup_own_db_access(sharedDbAdapter) Votes.setup_own_db_access(sharedDbAdapter) - Tags.setup_own_db_access(sharedDbAdapter) Follow.setup_own_db_access(sharedDbAdapter) Posts.setup_own_db_access(sharedDbAdapter) Reblog.setup_own_db_access(sharedDbAdapter) @@ -413,7 +410,6 @@ class Blocks: # remove posts: core, tags, cache entries if post_ids: - DB.query("DELETE FROM hive_post_tags WHERE post_id IN :ids", ids=post_ids) DB.query("DELETE FROM hive_posts WHERE id IN :ids", ids=post_ids) DB.query("DELETE FROM hive_post_data WHERE id IN :ids", ids=post_ids) diff --git a/hive/indexer/posts.py b/hive/indexer/posts.py index 08edcbb58..c36811488 100644 --- a/hive/indexer/posts.py +++ b/hive/indexer/posts.py @@ -14,7 +14,6 @@ from hive.indexer.feed_cache import FeedCache from hive.indexer.community import Community from hive.indexer.notify import Notify from hive.indexer.post_data_cache import PostDataCache -from hive.indexer.tags import Tags from hive.indexer.db_adapter_holder import DbAdapterHolder from hive.utils.misc import chunks @@ -152,8 +151,8 @@ class Posts(DbAdapterHolder): from funcy.seqs import distinct tags = list(distinct(tags))[:5] - for tag in tags: - Tags.add_tag(result['id'], tag) + sql = """SELECT add_tags( (:post_id)::INTEGER, (:tags)::VARCHAR[] )""" + DB.query_row( sql, post_id = result['id'], tags=tags ); if not DbState.is_initial_sync(): if error: diff --git a/hive/indexer/tags.py b/hive/indexer/tags.py deleted file mode 100644 index b3b25e82e..000000000 --- a/hive/indexer/tags.py +++ /dev/null @@ -1,74 +0,0 @@ -import logging -from hive.indexer.db_adapter_holder import DbAdapterHolder - -log = logging.getLogger(__name__) - -from hive.utils.normalize import escape_characters - -class Tags(DbAdapterHolder): - """ Tags cache """ - _tags = [] - - @classmethod - def add_tag(cls, tid, tag): - """ Add tag to cache """ - cls._tags.append((tid, tag)) - - @classmethod - 
def flush(cls): - """ Flush tags to table """ - if cls._tags: - cls.beginTx() - limit = 1000 - - sql = """ - INSERT INTO - hive_tag_data (tag) - VALUES {} - ON CONFLICT DO NOTHING - """ - values = [] - for tag in cls._tags: - values.append("({})".format(escape_characters(tag[1]))) - if len(values) >= limit: - tag_query = str(sql) - cls.db.query(tag_query.format(','.join(values))) - values.clear() - if len(values) > 0: - tag_query = str(sql) - cls.db.query(tag_query.format(','.join(values))) - values.clear() - - sql = """ - INSERT INTO - hive_post_tags (post_id, tag_id) - SELECT - data_source.post_id, data_source.tag_id - FROM - ( - SELECT - post_id, htd.id - FROM - ( - VALUES - {} - ) AS T(post_id, tag) - INNER JOIN hive_tag_data htd ON htd.tag = T.tag - ) AS data_source(post_id, tag_id) - ON CONFLICT DO NOTHING - """ - values = [] - for tag in cls._tags: - values.append("({}, {})".format(tag[0], escape_characters(tag[1]))) - if len(values) >= limit: - tag_query = str(sql) - cls.db.query(tag_query.format(','.join(values))) - values.clear() - if len(values) > 0: - tag_query = str(sql) - cls.db.query(tag_query.format(','.join(values))) - values.clear() - cls.commitTx() - n = len(cls._tags) - cls._tags.clear() - return n diff --git a/tests/tests_api b/tests/tests_api index 0f0fd1af6..1ffd591d3 160000 --- a/tests/tests_api +++ b/tests/tests_api @@ -1 +1 @@ -Subproject commit 0f0fd1af6d7e367849a87443c0137702b135e297 +Subproject commit 1ffd591d38c5e764e8a3910af2d5548d8b28a55b -- GitLab