From 94af61556421cca341849df6c1229e77f470d53b Mon Sep 17 00:00:00 2001 From: Bartek Wrona <wrona@syncad.com> Date: Fri, 19 Jun 2020 12:59:33 +0200 Subject: [PATCH] Improved post importing into DB - eliminated most of additional queries. --- hive/db/schema.py | 100 ++++++++++++++++++++++++++++++++++++++++-- hive/indexer/posts.py | 95 +++++++++++++++++++-------------------- 2 files changed, 141 insertions(+), 54 deletions(-) diff --git a/hive/db/schema.py b/hive/db/schema.py index 0a4f93bae..80f336f37 100644 --- a/hive/db/schema.py +++ b/hive/db/schema.py @@ -374,19 +374,113 @@ def setup(db): sqls = [ "INSERT INTO hive_state (block_num, db_version, steem_per_mvest, usd_per_steem, sbd_per_steem, dgpo) VALUES (0, %d, 0, 0, 0, '')" % DB_VERSION, "INSERT INTO hive_blocks (num, hash, created_at) VALUES (0, '0000000000000000000000000000000000000000', '2016-03-24 16:04:57')", + + "INSERT INTO hive_permlink_data (id, permlink) VALUES (0, '')", + "INSERT INTO hive_category_data (id, category) VALUES (0, '')", + "INSERT INTO hive_accounts (id, name, created_at) VALUES (0, '', '1990-01-01T00:00:00')", + "INSERT INTO hive_accounts (name, created_at) VALUES ('miners', '2016-03-24 16:05:00')", "INSERT INTO hive_accounts (name, created_at) VALUES ('null', '2016-03-24 16:05:00')", "INSERT INTO hive_accounts (name, created_at) VALUES ('temp', '2016-03-24 16:05:00')", "INSERT INTO hive_accounts (name, created_at) VALUES ('initminer', '2016-03-24 16:05:00')", - "INSERT INTO hive_accounts (name, created_at) VALUES ('', '1990-01-01T00:00:00') ON CONFLICT (name) DO NOTHING", - "INSERT INTO hive_permlink_data (permlink) VALUES ('') ON CONFLICT (permlink) DO NOTHING", - "INSERT INTO hive_category_data (category) VALUES ('') ON CONFLICT (category) DO NOTHING"] + + """INSERT INTO public.hive_posts( + id, parent_id, author_id, permlink_id, category_id, community_id, parent_author_id, parent_permlink_id, root_author_id, root_permlink_id, created_at, depth) + VALUES (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, now(), 0);"""] for sql in sqls: db.query(sql) sql = "CREATE INDEX hive_communities_ft1 ON hive_communities USING GIN (to_tsvector('english', title || ' ' || about))" db.query(sql) + sql = """ +DROP FUNCTION if exists add_hive_post(character varying,character varying,character varying,character varying,timestamp without time zone,timestamp without time zone) +; +CREATE OR REPLACE FUNCTION add_hive_post( + in _author hive_accounts.name%TYPE, + in _permlink hive_permlink_data.permlink%TYPE, + in _parent_author hive_accounts.name%TYPE, + in _parent_permlink hive_permlink_data.permlink%TYPE, + in _date hive_posts.created_at%TYPE, + in _community_support_start_date hive_posts.created_at%TYPE) +RETURNS TABLE (id hive_posts.id%TYPE, author_id hive_posts.author_id%TYPE, permlink_id hive_posts.permlink_id%TYPE, + parent_id hive_posts.parent_id%TYPE, community_id hive_posts.community_id%TYPE, + is_valid hive_posts.is_valid%TYPE, is_muted hive_posts.is_muted%TYPE, depth hive_posts.depth%TYPE) +LANGUAGE plpgsql +AS +$function$ +BEGIN + +INSERT INTO hive_permlink_data +(permlink) +values +( +_permlink +) +ON CONFLICT DO NOTHING +; +if _parent_author != '' THEN + RETURN QUERY INSERT INTO hive_posts + (parent_id, parent_author_id, parent_permlink_id, depth, community_id, + category_id, + root_author_id, root_permlink_id, + is_muted, is_valid, + author_id, permlink_id, created_at) + SELECT php.id AS parent_id, php.author_id as parent_author_id, + php.permlink_id as parent_permlink_id, php.depth + 1 as depth, + (CASE + WHEN _date > _community_support_start_date THEN + COALESCE(php.community_id, (select hc.id from hive_communities hc where hc.name = _parent_permlink)) + ELSE NULL + END) as community_id, + COALESCE(php.category_id, (select hcg.id from hive_category_data hcg where hcg.category = _parent_permlink)) as category_id, + php.root_author_id as root_author_id, + php.root_permlink_id as root_permlink_id, + php.is_muted as is_muted, php.is_valid as is_valid, + ha.id as author_id, hpd.id as permlink_id, _date as created_at + FROM hive_accounts ha, + hive_permlink_data hpd, + hive_posts php + INNER JOIN hive_accounts pha ON pha.id = php.author_id + INNER JOIN hive_permlink_data phpd ON phpd.id = php.permlink_id + WHERE pha.name = _parent_author and phpd.permlink = _parent_permlink AND + ha.name = _author and hpd.permlink = _permlink + RETURNING hive_posts.id, hive_posts.author_id, hive_posts.permlink_id, hive_posts.parent_id, hive_posts.community_id, hive_posts.is_valid, hive_posts.is_muted, hive_posts.depth +; +ELSE + INSERT INTO hive_category_data + (category) + VALUES (_parent_permlink) + ON CONFLICT (category) DO NOTHING + ; + + RETURN QUERY INSERT INTO hive_posts + (parent_id, parent_author_id, parent_permlink_id, depth, community_id, + category_id, + root_author_id, root_permlink_id, + is_muted, is_valid, + author_id, permlink_id, created_at) + SELECT 0 AS parent_id, 0 as parent_author_id, 0 as parent_permlink_id, 0 as depth, + (CASE + WHEN _date > _community_support_start_date THEN + (select hc.id from hive_communities hc where hc.name = _parent_permlink) + ELSE NULL + END) as community_id, + (select hcg.id from hive_category_data hcg where hcg.category = _parent_permlink) as category_id, + ha.id as root_author_id, -- use author_id as root one if no parent + hpd.id as root_permlink_id, -- use perlink_id as root one if no parent + false as is_muted, true as is_valid, + ha.id as author_id, hpd.id as permlink_id, _date as created_at + FROM hive_accounts ha, + hive_permlink_data hpd + WHERE ha.name = _author and hpd.permlink = _permlink + RETURNING hive_posts.id, hive_posts.author_id, hive_posts.permlink_id, hive_posts.parent_id, hive_posts.community_id, hive_posts.is_valid, hive_posts.is_muted, hive_posts.depth; +END IF; +END +$function$ + """ +# db.query(sql) + def reset_autovac(db): """Initializes/resets per-table autovacuum/autoanalyze params. diff --git a/hive/indexer/posts.py b/hive/indexer/posts.py index a748aa51c..68d096ead 100644 --- a/hive/indexer/posts.py +++ b/hive/indexer/posts.py @@ -39,12 +39,13 @@ class Posts: ( SELECT id, parent_id, 1 AS level from hive_posts WHERE id = (SELECT hp.id FROM hive_posts hp - LEFT JOIN hive_accounts ha_a ON ha_a.id = hp.author_id - LEFT JOIN hive_permlink_data hpd_p ON hpd_p.id = hp.permlink_id + INNER JOIN hive_accounts ha_a ON ha_a.id = hp.author_id + INNER JOIN hive_permlink_data hpd_p ON hpd_p.id = hp.permlink_id WHERE ha_a.name = :a AND hpd_p.permlink = :p) UNION ALL SELECT t.id, t.parent_id, level + 1 FROM parent INNER JOIN hive_posts t ON t.id = parent.parent_id + WHERE parent.parent_id != 0 ) SELECT id FROM parent ORDER BY level DESC LIMIT 1""" _id = DB.query_one(sql, a=author, p=permlink) @@ -63,8 +64,8 @@ class Posts: sql = """ SELECT hp.id FROM hive_posts hp - LEFT JOIN hive_accounts ha_a ON ha_a.id = hp.author_id - LEFT JOIN hive_permlink_data hpd_p ON hpd_p.id = hp.permlink_id + INNER JOIN hive_accounts ha_a ON ha_a.id = hp.author_id + INNER JOIN hive_permlink_data hpd_p ON hpd_p.id = hp.permlink_id WHERE ha_a.name = :a AND hpd_p.permlink = :p """ _id = DB.query_one(sql, a=author, p=permlink) @@ -199,62 +200,38 @@ class Posts: # * obtain id # * insert post content data - # add permlinks to permlink table - for permlink in ['permlink', 'parent_permlink', 'root_permlink']: - if permlink in op: - sql = """ - INSERT INTO hive_permlink_data (permlink) - VALUES (:permlink) - ON CONFLICT (permlink) DO NOTHING""" - DB.query(sql, permlink=op[permlink]) + sql = """ + SELECT id, author_id, permlink_id, parent_id, community_id, is_valid, is_muted, depth + FROM add_hive_post(:author, :permlink, :parent_author, :parent_permlink, :date, :community_support_start_date); + """ - post = cls._build_post(op, date) + row = DB.query_row(sql, author=op['author'], permlink=op['permlink'], parent_author=op['parent_author'], + parent_permlink=op['parent_permlink'], date=date, community_support_start_date=START_DATE) - # add category to category table - sql = """ - INSERT INTO hive_category_data (category) - VALUES (:category) - ON CONFLICT (category) DO NOTHING""" - DB.query(sql, category=post['category']) + result = dict(row) - sql = """ - INSERT INTO hive_posts (parent_id, author_id, permlink_id, - category_id, community_id, created_at, depth, is_muted, - is_valid, parent_author_id, parent_permlink_id, root_author_id, root_permlink_id) - VALUES (:parent_id, - (SELECT id FROM hive_accounts WHERE name = :author), - (SELECT id FROM hive_permlink_data WHERE permlink = :permlink), - (SELECT id FROM hive_category_data WHERE category = :category), - :community_id, :date, :depth, - :is_muted, :is_valid, - (SELECT id FROM hive_accounts WHERE name = :parent_author), - (SELECT id FROM hive_permlink_data WHERE permlink = :parent_permlink), - (SELECT id FROM hive_accounts WHERE name = :root_author), - (SELECT id FROM hive_permlink_data WHERE permlink = :root_permlink) - )""" - sql += ";SELECT currval(pg_get_serial_sequence('hive_posts','id'))" - - result = DB.query(sql, **post) - post['id'] = int(list(result)[0][0]) - cls._set_id(op['author']+'/'+op['permlink'], post['id']) + # TODO we need to enhance checking related community post validation and honor is_muted. + error = cls._verify_post_against_community(op, result['community_id'], result['is_valid'], result['is_muted']) + + cls._set_id(op['author']+'/'+op['permlink'], result['id']) # add content data to hive_post_data sql = """ INSERT INTO hive_post_data (id, title, preview, img_url, body, json) VALUES (:id, :title, :preview, :img_url, :body, :json)""" - DB.query(sql, id=post['id'], title=op['title'], + DB.query(sql, id=result['id'], title=op['title'], preview=op['preview'] if 'preview' in op else "", img_url=op['img_url'] if 'img_url' in op else "", body=op['body'], json=op['json_metadata'] if op['json_metadata'] else '{}') if not DbState.is_initial_sync(): - if post['error']: - author_id = Accounts.get_id(post['author']) + if error: + author_id = result.author_id Notify('error', dst_id=author_id, when=date, - post_id=post['id'], payload=post['error']).write() + post_id=result['id'], payload=error).write() if op['parent_author']: # update parent's child count - cls.update_child_count(post['parent_id']) - cls._insert_feed_cache(post) + cls.update_child_count(result['parent_id']) + cls._insert_feed_cache4(result['depth'], result['id'], result['author_id'], date) @classmethod def update_child_count(cls, parent_id, op='+'): @@ -455,8 +432,8 @@ class Posts: hpd_p.permlink as permlink FROM hive_posts hp - LEFT JOIN hive_accounts ha_a ON ha_a.id = hp.author_id - LEFT JOIN hive_permlink_data hpd_p ON hpd_p.id = hp.permlink_id + INNER JOIN hive_accounts ha_a ON ha_a.id = hp.author_id + INNER JOIN hive_permlink_data hpd_p ON hpd_p.id = hp.permlink_id WHERE hp.id = (SELECT parent_id FROM hive_posts WHERE id = :child_id)""" result = DB.query_row(sql, child_id=child_id) @@ -468,7 +445,23 @@ class Posts: """Insert the new post into feed cache if it's not a comment.""" if not post['depth']: account_id = Accounts.get_id(post['author']) - FeedCache.insert(post['id'], account_id, post['date']) + _insert_feed_cache4(cls, post['depth'], post['id'], account_id, post['date']) + + @classmethod + def _insert_feed_cache4(cls, post_depth, post_id, author_id, post_date): + """Insert the new post into feed cache if it's not a comment.""" + if not post_depth: + FeedCache.insert(post_id, account_id, post_date) + + + @classmethod + def _verify_post_against_community(cls, op, community_id, is_valid, is_muted): + error = None + if community_id and is_valid and not Community.is_post_valid(community_id, op): + error = 'not authorized' + #is_valid = False # TODO: reserved for future blacklist status? + is_muted = True + return error @classmethod def _build_post(cls, op, date, pid=None): @@ -503,7 +496,7 @@ class Posts: sql = """ SELECT depth, hcd.category as category, community_id, is_valid, is_muted FROM hive_posts hp - LEFT JOIN hive_category_data hcd ON hcd.id = hp.category_id + INNER JOIN hive_category_data hcd ON hcd.id = hp.category_id WHERE hp.id = :id""" (parent_depth, category, community_id, is_valid, is_muted) = DB.query_row(sql, id=parent_id) @@ -517,8 +510,8 @@ class Posts: ha_a.name as author, hpd_p.permlink as permlink FROM hive_posts hp - LEFT JOIN hive_accounts ha_a ON ha_a.id = hp.author_id - LEFT JOIN hive_permlink_data hpd_p ON hpd_p.id = hp.permlink_id + INNER JOIN hive_accounts ha_a ON ha_a.id = hp.author_id + INNER JOIN hive_permlink_data hpd_p ON hpd_p.id = hp.permlink_id WHERE hp.id = :id""" root_author, root_permlink = DB.query_row(sql, id=root_id) -- GitLab