diff --git a/hive/db/schema.py b/hive/db/schema.py
index 7dc9ff1a187e0ae6b5f33f0912b4b3f63ed66d04..d58d383b7ac455098bcff191c36d643093d03b50 100644
--- a/hive/db/schema.py
+++ b/hive/db/schema.py
@@ -209,10 +209,19 @@ def build_metadata():
         sa.Index('hive_votes_downvote_idx', 'vote_percent', postgresql_where=sql_text("vote_percent < 0"))
     )

+    sa.Table(
+        'hive_tag_data', metadata,
+        sa.Column('id', sa.Integer, nullable=False, primary_key=True),
+        sa.Column('tag', VARCHAR(32), nullable=False, server_default=''),
+        sa.UniqueConstraint('tag', name='hive_tag_data_ux1')
+    )
+
     sa.Table(
         'hive_post_tags', metadata,
-        sa.Column('post_id', sa.Integer, nullable=False, primary_key=True),
-        sa.Column('tag', sa.String(32), nullable=False),
+        sa.Column('post_id', sa.Integer, nullable=False),
+        sa.Column('tag_id', sa.Integer, nullable=False),
+        sa.ForeignKeyConstraint(['post_id'], ['hive_posts.id']),
+        sa.ForeignKeyConstraint(['tag_id'], ['hive_tag_data.id']),
     )

     sa.Table(
diff --git a/hive/indexer/blocks.py b/hive/indexer/blocks.py
index d71bc09573b1d641d8a3491d95c8c570046eeb57..9013190a9bb868460c200dc552354b9eed42e6d5 100644
--- a/hive/indexer/blocks.py
+++ b/hive/indexer/blocks.py
@@ -11,6 +11,7 @@ from hive.indexer.payments import Payments
 from hive.indexer.follow import Follow
 from hive.indexer.votes import Votes
 from hive.indexer.post_data_cache import PostDataCache
+from hive.indexer.tags import Tags
 from time import perf_counter

 log = logging.getLogger(__name__)
@@ -52,6 +53,7 @@ class Blocks:
         #assert is_trx_active(), "Block.process must be in a trx"
         ret = cls._process(block, vops_in_block, hived, is_initial_sync=False)
         PostDataCache.flush()
+        Tags.flush()
         Votes.flush()
         time_end = perf_counter()
         log.info("[PROCESS BLOCK] %fs", time_end - time_start)
@@ -75,6 +77,7 @@ class Blocks:
         # expensive. So is tracking follows at all; hence we track
         # deltas in memory and update follow/er counts in bulk.
         PostDataCache.flush()
+        Tags.flush()
         Votes.flush()
         cls._flush_blocks()
         Follow.flush(trx=False)
diff --git a/hive/indexer/post_data_cache.py b/hive/indexer/post_data_cache.py
index 83514c05a1d004dd2e5991eab3679b0b66e1cb4b..905afc3e8baea64a11f1c201c6c71676a33130b6 100644
--- a/hive/indexer/post_data_cache.py
+++ b/hive/indexer/post_data_cache.py
@@ -5,6 +5,7 @@ log = logging.getLogger(__name__)
 DB = Db.instance()

 def escape_characters(text):
+    """ Escape special characters """
     ret = str(text)
     ret = ret.replace("\\", "\\\\")
     ret = ret.replace("'", "''")
diff --git a/hive/indexer/posts.py b/hive/indexer/posts.py
index d060fe44cd54ec3f58092cafb4fcafeb227d690c..78322c5c23bbccff8fd7df4b47c9d827d5968bcb 100644
--- a/hive/indexer/posts.py
+++ b/hive/indexer/posts.py
@@ -3,7 +3,7 @@
 import logging
 import collections

-from json import dumps
+from json import dumps, loads

 from hive.db.adapter import Db
 from hive.db.db_state import DbState
@@ -13,6 +13,7 @@ from hive.indexer.feed_cache import FeedCache
 from hive.indexer.community import Community, START_DATE
 from hive.indexer.notify import Notify
 from hive.indexer.post_data_cache import PostDataCache
+from hive.indexer.tags import Tags
 from hive.utils.normalize import legacy_amount, asset_to_hbd_hive

 log = logging.getLogger(__name__)
@@ -113,6 +114,26 @@ class Posts:
                           json=op['json_metadata'] if op['json_metadata'] else '{}')
             PostDataCache.add_data(result['id'], post_data)

+            md = {}
+            # At least one case where jsonMetadata was double-encoded: condenser#895
+            # jsonMetadata = JSON.parse(jsonMetadata);
+            try:
+                md = loads(op['json_metadata'])
+                if not isinstance(md, dict):
+                    md = {}
+            except Exception:
+                pass
+
+            tags = [op['parent_permlink']]
+            if md and 'tags' in md and isinstance(md['tags'], list):
+                tags = tags + md['tags']
+            tags = map(lambda tag: (str(tag) or '').strip('# ').lower()[:32], tags)
+            tags = filter(None, tags)
+            from funcy.seqs import distinct
+            tags = list(distinct(tags))[:5]
+            for tag in tags:
+                Tags.add_tag(result['id'], tag)
+
             if not DbState.is_initial_sync():
                 if error:
                     author_id = result['author_id']
diff --git a/hive/indexer/tags.py b/hive/indexer/tags.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c258dc8f550fe84e10099d1c8d3626e39b28884
--- /dev/null
+++ b/hive/indexer/tags.py
@@ -0,0 +1,42 @@
+import logging
+from hive.db.adapter import Db
+
+log = logging.getLogger(__name__)
+DB = Db.instance()
+
+class Tags(object):
+    """ Tags cache """
+    _tags = []
+
+    @classmethod
+    def add_tag(cls, tid, tag):
+        """ Add tag to cache """
+        cls._tags.append((tid, tag))
+
+    @classmethod
+    def flush(cls):
+        """ Flush tags to table """
+        if cls._tags:
+            sql = """
+                INSERT INTO
+                    hive_tag_data (tag)
+                VALUES
+            """
+            values = []
+            for tag in cls._tags:
+                values.append("('{}')".format(tag[1]))
+            sql += ",".join(values)
+            sql += " ON CONFLICT DO NOTHING;"
+
+            sql += """
+                INSERT INTO
+                    hive_post_tags (post_id, tag_id)
+                VALUES
+            """
+            values = []
+            for tag in cls._tags:
+                values.append("({}, (SELECT id FROM hive_tag_data WHERE tag='{}'))".format(tag[0], tag[1]))
+            sql += ",".join(values)
+            sql += " ON CONFLICT DO NOTHING"
+            DB.query(sql)
+            cls._tags.clear()
diff --git a/hive/server/bridge_api/cursor.py b/hive/server/bridge_api/cursor.py
index fbf884d81991c6e7e55675b883ffa095e27fb36c..c65baf4163fa499774cef44dfad5e3b94dc7db3b 100644
--- a/hive/server/bridge_api/cursor.py
+++ b/hive/server/bridge_api/cursor.py
@@ -180,7 +180,7 @@ async def pids_by_category(db, tag, sort, last_id, limit):
     if sort in ['payout', 'payout_comments']:
         where.append('category_id = (SELECT id FROM hive_category_data WHERE category = :tag)')
     else:
-        sql = "SELECT post_id FROM hive_post_tags WHERE tag = :tag"
+        sql = "SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)"
         where.append("id IN (%s)" % sql)

     if last_id:
diff --git a/hive/server/bridge_api/methods.py b/hive/server/bridge_api/methods.py
index 11d1ef1a00c778e9d6f173c42cff29e22ebbf0a8..d473ff1cb0f8dd3c5e8ba73df9bf5f7fca47706f 100644
--- a/hive/server/bridge_api/methods.py
+++ b/hive/server/bridge_api/methods.py
@@ -191,7 +191,7 @@ async def get_ranked_posts(context, sort, start_author='', start_permlink='',
         if sort in ['payout', 'payout_comments']:
             sql = sql % """ AND hp.category = :tag """
         else:
-            sql = sql % """ AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag = :tag)"""
+            sql = sql % """ AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag))"""

     if not observer:
         observer = ''
diff --git a/hive/server/condenser_api/cursor.py b/hive/server/condenser_api/cursor.py
index f5cbaf87aae911741930ef1a26c6d336f614cfd5..980fbc1c8f10857f80efc7eb2aad7e8ac390cad5 100644
--- a/hive/server/condenser_api/cursor.py
+++ b/hive/server/condenser_api/cursor.py
@@ -175,7 +175,7 @@ async def pids_by_query(db, sort, start_author, start_permlink, limit, tag):
         where.append('category_id = (SELECT id FROM hive_category_data WHERE category = :tag)')
         if sort in ('trending', 'hot'):
             where.append('depth = 0')
-        sql = "SELECT post_id FROM hive_post_tags WHERE tag = :tag"
+        sql = "SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)"
         where.append("id IN (%s)" % sql)

     start_id = None
diff --git a/hive/server/condenser_api/methods.py b/hive/server/condenser_api/methods.py
index d6a104a38cf3afc8402ac215dee91a1bdfa6520c..00afdeba603399fe4a9bbbd8ee7681ff8483b2ab 100644
--- a/hive/server/condenser_api/methods.py
+++ b/hive/server/condenser_api/methods.py
@@ -251,7 +251,7 @@ async def get_discussions_by(discussion_type, context, start_author: str = '',
         if tag[:5] == 'hive-':
             sql = sql % """ %s AND hp.category = :tag """
         else:
-            sql = sql % """ %s AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag = :tag) """
+            sql = sql % """ %s AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)) """

     if start_author and start_permlink:
         if discussion_type == 'trending':