From 76ef6deaae9163c932f18734273a8107cd1a4b05 Mon Sep 17 00:00:00 2001 From: Dariusz Kedzierski <dkedzierski@syncad.com> Date: Wed, 24 Jun 2020 19:44:40 +0200 Subject: [PATCH] - In tags: * added query limit to avoid SQL overflow errors, * fixed inefficient insert query, the previous one was using selects for each row, - In apis: * subqueries were changed to inner joins --- hive/indexer/tags.py | 44 ++++++++++++++++++++++------ hive/server/bridge_api/cursor.py | 9 +++++- hive/server/bridge_api/methods.py | 10 ++++++- hive/server/condenser_api/cursor.py | 9 +++++- hive/server/condenser_api/methods.py | 9 +++++- 5 files changed, 68 insertions(+), 13 deletions(-) diff --git a/hive/indexer/tags.py b/hive/indexer/tags.py index f60165cda..e63d198e0 100644 --- a/hive/indexer/tags.py +++ b/hive/indexer/tags.py @@ -19,27 +19,53 @@ class Tags(object): def flush(cls): """ Flush tags to table """ if cls._tags: + limit = 1000 + sql = """ INSERT INTO hive_tag_data (tag) - VALUES + VALUES {} + ON CONFLICT DO NOTHING """ values = [] for tag in cls._tags: values.append("('{}')".format(escape_characters(tag[1]))) - sql += ",".join(values) - sql += " ON CONFLICT DO NOTHING" - DB.query(sql) + if len(values) >= limit: + tag_query = str(sql) + DB.query(tag_query.format(','.join(values))) + values.clear() + if len(values) > 0: + tag_query = str(sql) + DB.query(tag_query.format(','.join(values))) + values.clear() sql = """ INSERT INTO hive_post_tags (post_id, tag_id) - VALUES + SELECT + data_source.post_id, data_source.tag_id + FROM + ( + SELECT + post_id, htd.id + FROM + ( + VALUES + {} + ) AS T(post_id, tag) + INNER JOIN hive_tag_data htd ON htd.tag = T.tag + ) AS data_source(post_id, tag_id) + ON CONFLICT DO NOTHING """ values = [] for tag in cls._tags: - values.append("({}, (SELECT id FROM hive_tag_data WHERE tag='{}'))".format(tag[0], escape_characters(tag[1]))) - sql += ",".join(values) - sql += " ON CONFLICT DO NOTHING" - DB.query(sql) + values.append("({}, '{}')".format(tag[0], 
escape_characters(tag[1]))) + if len(values) >= limit: + tag_query = str(sql) + DB.query(tag_query.format(','.join(values))) + values.clear() + if len(values) > 0: + tag_query = str(sql) + DB.query(tag_query.format(','.join(values))) + values.clear() cls._tags.clear() diff --git a/hive/server/bridge_api/cursor.py b/hive/server/bridge_api/cursor.py index c65baf416..6a0fc86bb 100644 --- a/hive/server/bridge_api/cursor.py +++ b/hive/server/bridge_api/cursor.py @@ -180,7 +180,14 @@ async def pids_by_category(db, tag, sort, last_id, limit): if sort in ['payout', 'payout_comments']: where.append('category_id = (SELECT id FROM hive_category_data WHERE category = :tag)') else: - sql = "SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)" + sql = """ + SELECT + post_id + FROM + hive_post_tags hpt + INNER JOIN hive_tag_data htd ON hpt.tag_id=htd.id + WHERE htd.tag = :tag + """ where.append("id IN (%s)" % sql) if last_id: diff --git a/hive/server/bridge_api/methods.py b/hive/server/bridge_api/methods.py index d473ff1cb..8f10fd0fb 100644 --- a/hive/server/bridge_api/methods.py +++ b/hive/server/bridge_api/methods.py @@ -191,7 +191,15 @@ async def get_ranked_posts(context, sort, start_author='', start_permlink='', if sort in ['payout', 'payout_comments']: sql = sql % """ AND hp.category = :tag """ else: - sql = sql % """ AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag))""" + sql = sql % """ AND hp.post_id IN + (SELECT + post_id + FROM + hive_post_tags hpt + INNER JOIN hive_tag_data htd ON hpt.tag_id=htd.id + WHERE htd.tag = :tag + ) + """ if not observer: observer = '' diff --git a/hive/server/condenser_api/cursor.py b/hive/server/condenser_api/cursor.py index 980fbc1c8..006a47d4e 100644 --- a/hive/server/condenser_api/cursor.py +++ b/hive/server/condenser_api/cursor.py @@ -175,7 +175,14 @@ async def pids_by_query(db, sort, start_author, start_permlink, limit, tag): 
where.append('category_id = (SELECT id FROM hive_category_data WHERE category = :tag)') if sort in ('trending', 'hot'): where.append('depth = 0') - sql = "SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)" + sql = """ + SELECT + post_id + FROM + hive_post_tags hpt + INNER JOIN hive_tag_data htd ON hpt.tag_id=htd.id + WHERE htd.tag = :tag + """ where.append("id IN (%s)" % sql) start_id = None diff --git a/hive/server/condenser_api/methods.py b/hive/server/condenser_api/methods.py index 00afdeba6..a6a0ba05d 100644 --- a/hive/server/condenser_api/methods.py +++ b/hive/server/condenser_api/methods.py @@ -251,7 +251,14 @@ async def get_discussions_by(discussion_type, context, start_author: str = '', if tag[:5] == 'hive-': sql = sql % """ %s AND hp.category = :tag """ else: - sql = sql % """ %s AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)) """ + sql = sql % """ %s AND hp.post_id IN + (SELECT + post_id + FROM + hive_post_tags hpt + INNER JOIN hive_tag_data htd ON hpt.tag_id=htd.id + WHERE htd.tag = :tag + ) """ if start_author and start_permlink: if discussion_type == 'trending': -- GitLab