From 3e8b1b76688cf3bcb55f31ce4452d5d15316d426 Mon Sep 17 00:00:00 2001 From: Marcin Ickiewicz <mickiewicz@syncad.com> Date: Tue, 17 Nov 2020 15:55:52 +0100 Subject: [PATCH] indexing tags only in sql --- hive/db/sql_scripts/hive_post_operations.sql | 76 +++++++++++--------- hive/indexer/posts.py | 40 +++++------ 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/hive/db/sql_scripts/hive_post_operations.sql b/hive/db/sql_scripts/hive_post_operations.sql index 26715bb46..4636957ca 100644 --- a/hive/db/sql_scripts/hive_post_operations.sql +++ b/hive/db/sql_scripts/hive_post_operations.sql @@ -1,4 +1,31 @@ -DROP FUNCTION if exists process_hive_post_operation(character varying,character varying,character varying,character varying,timestamp without time zone,timestamp without time zone) +DROP FUNCTION IF EXISTS prepare_tags; +CREATE OR REPLACE FUNCTION prepare_tags( in _raw_tags VARCHAR[] ) +RETURNS SETOF hive_tag_data.id%TYPE +LANGUAGE 'plpgsql' +VOLATILE +AS +$function$ +DECLARE + __i INTEGER; + __tags VARCHAR[]; + __tag TEXT; +BEGIN + FOR __i IN 1 .. ARRAY_UPPER( _raw_tags, 1) + LOOP + __tag = LEFT(LOWER(REGEXP_REPLACE( _raw_tags[ __i ], '[#\s]', '', 'g' )),32); + CONTINUE WHEN __tag = '' OR __tags @> ARRAY_APPEND(ARRAY[]::VARCHAR[], CAST(__tag as VARCHAR) ); + __tags = ARRAY_APPEND( __tags, CAST(__tag as VARCHAR) ); + END LOOP; + + RETURN QUERY INSERT INTO + hive_tag_data AS htd(tag) + SELECT UNNEST( __tags ) + ON CONFLICT("tag") DO UPDATE SET tag=EXCLUDED.tag --trick to always return id + RETURNING htd.id; +END +$function$; + +DROP FUNCTION IF EXISTS process_hive_post_operation; ; CREATE OR REPLACE FUNCTION process_hive_post_operation( in _author hive_accounts.name%TYPE, @@ -7,7 +34,9 @@ CREATE OR REPLACE FUNCTION process_hive_post_operation( in _parent_permlink hive_permlink_data.permlink%TYPE, in _date hive_posts.created_at%TYPE, in _community_support_start_date hive_posts.created_at%TYPE, - in _block_num hive_posts.block_num%TYPE) + in _block_num hive_posts.block_num%TYPE, + in _metadata_tags VARCHAR[] +) RETURNS TABLE (is_new_post boolean, id hive_posts.id%TYPE, author_id hive_posts.author_id%TYPE, permlink_id hive_posts.permlink_id%TYPE, post_category hive_category_data.category%TYPE, parent_id hive_posts.parent_id%TYPE, community_id hive_posts.community_id%TYPE, is_valid hive_posts.is_valid%TYPE, is_muted hive_posts.is_muted%TYPE, depth hive_posts.depth%TYPE) @@ -28,7 +57,9 @@ if _parent_author != '' THEN RETURN QUERY INSERT INTO hive_posts as hp (parent_id, depth, community_id, category_id, root_id, is_muted, is_valid, - author_id, permlink_id, created_at, updated_at, sc_hot, sc_trend, active, payout_at, cashout_time, counter_deleted, block_num, block_num_created) + author_id, permlink_id, created_at, updated_at, sc_hot, sc_trend, + active, payout_at, cashout_time, counter_deleted, block_num, block_num_created + ) SELECT php.id AS parent_id, php.depth + 1 AS depth, (CASE WHEN _date > _community_support_start_date THEN @@ -74,7 +105,9 @@ ELSE RETURN QUERY INSERT INTO hive_posts as hp (parent_id, depth, community_id, category_id, root_id, is_muted, is_valid, - author_id, permlink_id, created_at, updated_at, sc_hot, sc_trend, active, payout_at, cashout_time, counter_deleted, block_num, block_num_created) + author_id, permlink_id, created_at, updated_at, sc_hot, sc_trend, + active, payout_at, cashout_time, counter_deleted, block_num, block_num_created, + tags_ids) SELECT 0 AS parent_id, 0 AS depth, (CASE WHEN _date > _community_support_start_date THEN @@ -90,8 +123,12 @@ ELSE calculate_time_part_of_trending(_date) AS sc_trend, _date AS active, (_date + INTERVAL '7 days') AS payout_at, (_date + INTERVAL '7 days') AS cashout_time, 0 , _block_num as block_num, _block_num as block_num_created + , ( + SELECT ARRAY_AGG( prepare_tags ) + FROM prepare_tags( ARRAY_APPEND(_metadata_tags, _parent_permlink ) ) + ) as tags_ids FROM hive_accounts ha, - hive_permlink_data hpd + hive_permlink_data hpd WHERE ha.name = _author and hpd.permlink = _permlink ON CONFLICT ON CONSTRAINT hive_posts_ux1 DO UPDATE SET @@ -100,7 +137,8 @@ ELSE --- post edit part updated_at = _date, active = _date, - block_num = _block_num + block_num = _block_num, + tags_ids = EXCLUDED.tags_ids RETURNING (xmax = 0) as is_new_post, hp.id, hp.author_id, hp.permlink_id, _parent_permlink as post_category, hp.parent_id, hp.community_id, hp.is_valid, hp.is_muted, hp.depth ; @@ -138,29 +176,3 @@ BEGIN END $function$ ; - -DROP FUNCTION IF EXISTS add_tags; -CREATE FUNCTION add_tags( in _post_id hive_posts.id%TYPE, in _tags VARCHAR[] ) -RETURNS void -LANGUAGE 'plpgsql' -VOLATILE -AS -$function$ -DECLARE - __tags_ids INTEGER[]; -BEGIN - WITH tags_ids(id) AS - ( - INSERT INTO - hive_tag_data AS htd(tag) - SELECT UNNEST( _tags ) - ON CONFLICT("tag") DO UPDATE SET tag=EXCLUDED.tag --trick to always return id - RETURNING htd.id - ) - SELECT ARRAY_AGG( id ) INTO __tags_ids FROM tags_ids; - - UPDATE hive_posts hp - SET tags_ids = __tags_ids - WHERE hp.id = _post_id; -END -$function$ diff --git a/hive/indexer/posts.py b/hive/indexer/posts.py index 04ab27f21..35180f034 100644 --- a/hive/indexer/posts.py +++ b/hive/indexer/posts.py @@ -89,13 +89,27 @@ class Posts(DbAdapterHolder): def comment_op(cls, op, block_date): """Register new/edited/undeleted posts; insert into feed cache.""" + md = {} + # At least one case where jsonMetadata was double-encoded: condenser#895 + # jsonMetadata = JSON.parse(jsonMetadata); + try: + md = loads(op['json_metadata']) + if not isinstance(md, dict): + md = {} + except Exception: + pass + + tags = [] + if md and 'tags' in md and isinstance(md['tags'], list): + tags = md['tags'] + sql = """ SELECT is_new_post, id, author_id, permlink_id, post_category, parent_id, community_id, is_valid, is_muted, depth - FROM process_hive_post_operation((:author)::varchar, (:permlink)::varchar, (:parent_author)::varchar, (:parent_permlink)::varchar, (:date)::timestamp, (:community_support_start_date)::timestamp, (:block_num)::integer); + FROM process_hive_post_operation((:author)::varchar, (:permlink)::varchar, (:parent_author)::varchar, (:parent_permlink)::varchar, (:date)::timestamp, (:community_support_start_date)::timestamp, (:block_num)::integer, (:tags)::VARCHAR[]); """ row = DB.query_row(sql, author=op['author'], permlink=op['permlink'], parent_author=op['parent_author'], - parent_permlink=op['parent_permlink'], date=block_date, community_support_start_date=START_DATE, block_num=op['block_num']) + parent_permlink=op['parent_permlink'], date=block_date, community_support_start_date=START_DATE, block_num=op['block_num'], tags=tags) result = dict(row) @@ -104,16 +118,6 @@ class Posts(DbAdapterHolder): cls._set_id(op['author']+'/'+op['permlink'], result['id']) - md = {} - # At least one case where jsonMetadata was double-encoded: condenser#895 - # jsonMetadata = JSON.parse(jsonMetadata); - try: - md = loads(op['json_metadata']) - if not isinstance(md, dict): - md = {} - except Exception: - pass - img_url = None if 'image' in md: img_url = md['image'] @@ -142,18 +146,6 @@ class Posts(DbAdapterHolder): # log.info("Adding author: {} permlink: {}".format(op['author'], op['permlink'])) PostDataCache.add_data(result['id'], post_data, is_new_post) - if not result['depth']: - tags = [result['post_category']] - if md and 'tags' in md and isinstance(md['tags'], list): - tags = tags + md['tags'] - tags = map(lambda tag: (str(tag) or '').strip('# ').lower()[:32], tags) - tags = filter(None, tags) - from funcy.seqs import distinct - tags = list(distinct(tags))[:5] - - sql = """SELECT add_tags( (:post_id)::INTEGER, (:tags)::VARCHAR[] )""" - DB.query_row( sql, post_id = result['id'], tags=tags ); - if not DbState.is_initial_sync(): if error: author_id = result['author_id'] -- GitLab