From 3a23e2e3fe90301d993a2dae6072428e279364ac Mon Sep 17 00:00:00 2001 From: Dariusz Kedzierski <dkedzierski@syncad.com> Date: Wed, 24 Jun 2020 16:25:38 +0200 Subject: [PATCH] Tag support --- hive/db/schema.py | 13 +++++++-- hive/indexer/blocks.py | 3 ++ hive/indexer/post_data_cache.py | 1 + hive/indexer/posts.py | 23 ++++++++++++++- hive/indexer/tags.py | 42 ++++++++++++++++++++++++++++ hive/server/bridge_api/cursor.py | 2 +- hive/server/bridge_api/methods.py | 2 +- hive/server/condenser_api/cursor.py | 2 +- hive/server/condenser_api/methods.py | 2 +- 9 files changed, 83 insertions(+), 7 deletions(-) create mode 100644 hive/indexer/tags.py diff --git a/hive/db/schema.py b/hive/db/schema.py index 7dc9ff1a1..d58d383b7 100644 --- a/hive/db/schema.py +++ b/hive/db/schema.py @@ -209,10 +209,19 @@ def build_metadata(): sa.Index('hive_votes_downvote_idx', 'vote_percent', postgresql_where=sql_text("vote_percent < 0")) ) + sa.Table( + 'hive_tag_data', metadata, + sa.Column('id', sa.Integer, nullable=False, primary_key=True), + sa.Column('tag', VARCHAR(32), nullable=False, server_default=''), + sa.UniqueConstraint('tag', name='hive_tag_data_ux1') + ) + sa.Table( 'hive_post_tags', metadata, - sa.Column('post_id', sa.Integer, nullable=False, primary_key=True), - sa.Column('tag', sa.String(32), nullable=False), + sa.Column('post_id', sa.Integer, nullable=False), + sa.Column('tag_id', sa.Integer, nullable=False), + sa.ForeignKeyConstraint(['post_id'], ['hive_posts.id']), + sa.ForeignKeyConstraint(['tag_id'], ['hive_tag_data.id']), ) sa.Table( diff --git a/hive/indexer/blocks.py b/hive/indexer/blocks.py index d71bc0957..9013190a9 100644 --- a/hive/indexer/blocks.py +++ b/hive/indexer/blocks.py @@ -11,6 +11,7 @@ from hive.indexer.payments import Payments from hive.indexer.follow import Follow from hive.indexer.votes import Votes from hive.indexer.post_data_cache import PostDataCache +from hive.indexer.tags import Tags from time import perf_counter log = logging.getLogger(__name__) @@ -52,6 +53,7 @@ class Blocks: #assert is_trx_active(), "Block.process must be in a trx" ret = cls._process(block, vops_in_block, hived, is_initial_sync=False) PostDataCache.flush() + Tags.flush() Votes.flush() time_end = perf_counter() log.info("[PROCESS BLOCK] %fs", time_end - time_start) @@ -75,6 +77,7 @@ class Blocks: # expensive. So is tracking follows at all; hence we track # deltas in memory and update follow/er counts in bulk. PostDataCache.flush() + Tags.flush() Votes.flush() cls._flush_blocks() Follow.flush(trx=False) diff --git a/hive/indexer/post_data_cache.py b/hive/indexer/post_data_cache.py index 83514c05a..905afc3e8 100644 --- a/hive/indexer/post_data_cache.py +++ b/hive/indexer/post_data_cache.py @@ -5,6 +5,7 @@ log = logging.getLogger(__name__) DB = Db.instance() def escape_characters(text): + """ Escape special charactes """ ret = str(text) ret = ret.replace("\\", "\\\\") ret = ret.replace("'", "''") diff --git a/hive/indexer/posts.py b/hive/indexer/posts.py index d060fe44c..78322c5c2 100644 --- a/hive/indexer/posts.py +++ b/hive/indexer/posts.py @@ -3,7 +3,7 @@ import logging import collections -from json import dumps +from json import dumps, loads from hive.db.adapter import Db from hive.db.db_state import DbState @@ -13,6 +13,7 @@ from hive.indexer.feed_cache import FeedCache from hive.indexer.community import Community, START_DATE from hive.indexer.notify import Notify from hive.indexer.post_data_cache import PostDataCache +from hive.indexer.tags import Tags from hive.utils.normalize import legacy_amount, asset_to_hbd_hive log = logging.getLogger(__name__) @@ -113,6 +114,26 @@ class Posts: json=op['json_metadata'] if op['json_metadata'] else '{}') PostDataCache.add_data(result['id'], post_data) + md = {} + # At least one case where jsonMetadata was double-encoded: condenser#895 + # jsonMetadata = JSON.parse(jsonMetadata); + try: + md = loads(op['json_metadata']) + if not isinstance(md, dict): + md = {} + except Exception: + pass + + tags = [op['parent_permlink']] + if md and 'tags' in md and isinstance(md['tags'], list): + tags = tags + md['tags'] + tags = map(lambda tag: (str(tag) or '').strip('# ').lower()[:32], tags) + tags = filter(None, tags) + from funcy.seqs import distinct + tags = list(distinct(tags))[:5] + for tag in tags: + Tags.add_tag(result['id'], tag) + if not DbState.is_initial_sync(): if error: author_id = result['author_id'] diff --git a/hive/indexer/tags.py b/hive/indexer/tags.py new file mode 100644 index 000000000..5c258dc8f --- /dev/null +++ b/hive/indexer/tags.py @@ -0,0 +1,42 @@ +import logging +from hive.db.adapter import Db + +log = logging.getLogger(__name__) +DB = Db.instance() + +class Tags(object): + """ Tags cache """ + _tags = [] + + @classmethod + def add_tag(cls, tid, tag): + """ Add tag to cache """ + cls._tags.append((tid, tag)) + + @classmethod + def flush(cls): + """ Flush tags to table """ + if cls._tags: + sql = """ + INSERT INTO + hive_tag_data (tag) + VALUES + """ + values = [] + for tag in cls._tags: + values.append("('{}')".format(tag[1])) + sql += ",".join(values) + sql += " ON CONFLICT DO NOTHING;" + + sql += """ + INSERT INTO + hive_post_tags (post_id, tag_id) + VALUES + """ + values = [] + for tag in cls._tags: + values.append("({}, (SELECT id FROM hive_tag_data WHERE tag='{}'))".format(tag[0], tag[1])) + sql += ",".join(values) + sql += " ON CONFLICT DO NOTHING" + DB.query(sql) + cls._tags.clear() diff --git a/hive/server/bridge_api/cursor.py b/hive/server/bridge_api/cursor.py index fbf884d81..c65baf416 100644 --- a/hive/server/bridge_api/cursor.py +++ b/hive/server/bridge_api/cursor.py @@ -180,7 +180,7 @@ async def pids_by_category(db, tag, sort, last_id, limit): if sort in ['payout', 'payout_comments']: where.append('category_id = (SELECT id FROM hive_category_data WHERE category = :tag)') else: - sql = "SELECT post_id FROM hive_post_tags WHERE tag = :tag" + sql = "SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)" where.append("id IN (%s)" % sql) if last_id: diff --git a/hive/server/bridge_api/methods.py b/hive/server/bridge_api/methods.py index 11d1ef1a0..d473ff1cb 100644 --- a/hive/server/bridge_api/methods.py +++ b/hive/server/bridge_api/methods.py @@ -191,7 +191,7 @@ async def get_ranked_posts(context, sort, start_author='', start_permlink='', if sort in ['payout', 'payout_comments']: sql = sql % """ AND hp.category = :tag """ else: - sql = sql % """ AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag = :tag)""" + sql = sql % """ AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag))""" if not observer: observer = '' diff --git a/hive/server/condenser_api/cursor.py b/hive/server/condenser_api/cursor.py index f5cbaf87a..980fbc1c8 100644 --- a/hive/server/condenser_api/cursor.py +++ b/hive/server/condenser_api/cursor.py @@ -175,7 +175,7 @@ async def pids_by_query(db, sort, start_author, start_permlink, limit, tag): where.append('category_id = (SELECT id FROM hive_category_data WHERE category = :tag)') if sort in ('trending', 'hot'): where.append('depth = 0') - sql = "SELECT post_id FROM hive_post_tags WHERE tag = :tag" + sql = "SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)" where.append("id IN (%s)" % sql) start_id = None diff --git a/hive/server/condenser_api/methods.py b/hive/server/condenser_api/methods.py index d6a104a38..00afdeba6 100644 --- a/hive/server/condenser_api/methods.py +++ b/hive/server/condenser_api/methods.py @@ -251,7 +251,7 @@ async def get_discussions_by(discussion_type, context, start_author: str = '', if tag[:5] == 'hive-': sql = sql % """ %s AND hp.category = :tag """ else: - sql = sql % """ %s AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag = :tag) """ + sql = sql % """ %s AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)) """ if start_author and start_permlink: if discussion_type == 'trending': -- GitLab