From 3a23e2e3fe90301d993a2dae6072428e279364ac Mon Sep 17 00:00:00 2001
From: Dariusz Kedzierski <dkedzierski@syncad.com>
Date: Wed, 24 Jun 2020 16:25:38 +0200
Subject: [PATCH] Tag support

---
 hive/db/schema.py                    | 13 +++++++--
 hive/indexer/blocks.py               |  3 ++
 hive/indexer/post_data_cache.py      |  1 +
 hive/indexer/posts.py                | 23 ++++++++++++++-
 hive/indexer/tags.py                 | 42 ++++++++++++++++++++++++++++
 hive/server/bridge_api/cursor.py     |  2 +-
 hive/server/bridge_api/methods.py    |  2 +-
 hive/server/condenser_api/cursor.py  |  2 +-
 hive/server/condenser_api/methods.py |  2 +-
 9 files changed, 83 insertions(+), 7 deletions(-)
 create mode 100644 hive/indexer/tags.py

diff --git a/hive/db/schema.py b/hive/db/schema.py
index 7dc9ff1a1..d58d383b7 100644
--- a/hive/db/schema.py
+++ b/hive/db/schema.py
@@ -209,10 +209,19 @@ def build_metadata():
         sa.Index('hive_votes_downvote_idx', 'vote_percent', postgresql_where=sql_text("vote_percent < 0"))
     )
 
+    sa.Table(
+        'hive_tag_data', metadata,
+        sa.Column('id', sa.Integer, nullable=False, primary_key=True),
+        sa.Column('tag', VARCHAR(32), nullable=False, server_default=''),
+        sa.UniqueConstraint('tag', name='hive_tag_data_ux1')
+    )
+
     sa.Table(
         'hive_post_tags', metadata,
-        sa.Column('post_id', sa.Integer, nullable=False, primary_key=True),
-        sa.Column('tag', sa.String(32), nullable=False),
+        sa.Column('post_id', sa.Integer, nullable=False),
+        sa.Column('tag_id', sa.Integer, nullable=False),
+        sa.ForeignKeyConstraint(['post_id'], ['hive_posts.id']),
+        sa.ForeignKeyConstraint(['tag_id'], ['hive_tag_data.id']),
     )
 
     sa.Table(
diff --git a/hive/indexer/blocks.py b/hive/indexer/blocks.py
index d71bc0957..9013190a9 100644
--- a/hive/indexer/blocks.py
+++ b/hive/indexer/blocks.py
@@ -11,6 +11,7 @@ from hive.indexer.payments import Payments
 from hive.indexer.follow import Follow
 from hive.indexer.votes import Votes
 from hive.indexer.post_data_cache import PostDataCache
+from hive.indexer.tags import Tags
 from time import perf_counter
 
 log = logging.getLogger(__name__)
@@ -52,6 +53,7 @@ class Blocks:
         #assert is_trx_active(), "Block.process must be in a trx"
         ret = cls._process(block, vops_in_block, hived, is_initial_sync=False)
         PostDataCache.flush()
+        Tags.flush()
         Votes.flush()
         time_end = perf_counter()
         log.info("[PROCESS BLOCK] %fs", time_end - time_start)
@@ -75,6 +77,7 @@ class Blocks:
         # expensive. So is tracking follows at all; hence we track
         # deltas in memory and update follow/er counts in bulk.
         PostDataCache.flush()
+        Tags.flush()
         Votes.flush()
         cls._flush_blocks()
         Follow.flush(trx=False)
diff --git a/hive/indexer/post_data_cache.py b/hive/indexer/post_data_cache.py
index 83514c05a..905afc3e8 100644
--- a/hive/indexer/post_data_cache.py
+++ b/hive/indexer/post_data_cache.py
@@ -5,6 +5,7 @@ log = logging.getLogger(__name__)
 DB = Db.instance()
 
 def escape_characters(text):
+    """ Escape special charactes """
     ret = str(text)
     ret = ret.replace("\\", "\\\\")
     ret = ret.replace("'", "''")
diff --git a/hive/indexer/posts.py b/hive/indexer/posts.py
index d060fe44c..78322c5c2 100644
--- a/hive/indexer/posts.py
+++ b/hive/indexer/posts.py
@@ -3,7 +3,7 @@
 import logging
 import collections
 
-from json import dumps
+from json import dumps, loads
 
 from hive.db.adapter import Db
 from hive.db.db_state import DbState
@@ -13,6 +13,7 @@ from hive.indexer.feed_cache import FeedCache
 from hive.indexer.community import Community, START_DATE
 from hive.indexer.notify import Notify
 from hive.indexer.post_data_cache import PostDataCache
+from hive.indexer.tags import Tags
 from hive.utils.normalize import legacy_amount, asset_to_hbd_hive
 
 log = logging.getLogger(__name__)
@@ -113,6 +114,26 @@ class Posts:
                          json=op['json_metadata'] if op['json_metadata'] else '{}')
         PostDataCache.add_data(result['id'], post_data)
 
+        md = {}
+        # At least one case where jsonMetadata was double-encoded: condenser#895
+        # jsonMetadata = JSON.parse(jsonMetadata);
+        try:
+            md = loads(op['json_metadata'])
+            if not isinstance(md, dict):
+                md = {}
+        except Exception:
+            pass
+
+        tags = [op['parent_permlink']]
+        if md and 'tags' in md and isinstance(md['tags'], list):
+            tags = tags + md['tags']
+        tags = map(lambda tag: (str(tag) or '').strip('# ').lower()[:32], tags)
+        tags = filter(None, tags)
+        from funcy.seqs import distinct
+        tags = list(distinct(tags))[:5]
+        for tag in tags:
+            Tags.add_tag(result['id'], tag)
+
         if not DbState.is_initial_sync():
             if error:
                 author_id = result['author_id']
diff --git a/hive/indexer/tags.py b/hive/indexer/tags.py
new file mode 100644
index 000000000..5c258dc8f
--- /dev/null
+++ b/hive/indexer/tags.py
@@ -0,0 +1,42 @@
+import logging
+from hive.db.adapter import Db
+
+log = logging.getLogger(__name__)
+DB = Db.instance()
+
+class Tags(object):
+    """ Tags cache """
+    _tags = []
+
+    @classmethod
+    def add_tag(cls, tid, tag):
+        """ Add tag to cache """
+        cls._tags.append((tid, tag))
+
+    @classmethod
+    def flush(cls):
+        """ Flush tags to table """
+        if cls._tags:
+            sql = """
+                INSERT INTO
+                    hive_tag_data (tag)
+                VALUES 
+            """
+            values = []
+            for tag in cls._tags:
+                values.append("('{}')".format(tag[1]))
+            sql += ",".join(values)
+            sql += " ON CONFLICT DO NOTHING;"
+
+            sql += """
+                INSERT INTO
+                    hive_post_tags (post_id, tag_id)
+                VALUES 
+            """
+            values = []
+            for tag in cls._tags:
+                values.append("({}, (SELECT id FROM hive_tag_data WHERE tag='{}'))".format(tag[0], tag[1]))
+            sql += ",".join(values)
+            sql += " ON CONFLICT DO NOTHING"
+            DB.query(sql)
+            cls._tags.clear()
diff --git a/hive/server/bridge_api/cursor.py b/hive/server/bridge_api/cursor.py
index fbf884d81..c65baf416 100644
--- a/hive/server/bridge_api/cursor.py
+++ b/hive/server/bridge_api/cursor.py
@@ -180,7 +180,7 @@ async def pids_by_category(db, tag, sort, last_id, limit):
         if sort in ['payout', 'payout_comments']:
             where.append('category_id = (SELECT id FROM hive_category_data WHERE category = :tag)')
         else:
-            sql = "SELECT post_id FROM hive_post_tags WHERE tag = :tag"
+            sql = "SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)"
             where.append("id IN (%s)" % sql)
 
     if last_id:
diff --git a/hive/server/bridge_api/methods.py b/hive/server/bridge_api/methods.py
index 11d1ef1a0..d473ff1cb 100644
--- a/hive/server/bridge_api/methods.py
+++ b/hive/server/bridge_api/methods.py
@@ -191,7 +191,7 @@ async def get_ranked_posts(context, sort, start_author='', start_permlink='',
         if sort in ['payout', 'payout_comments']:
             sql = sql % """ AND hp.category = :tag """
         else:
-            sql = sql % """ AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag = :tag)"""
+            sql = sql % """ AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag))"""
 
     if not observer:
         observer = ''
diff --git a/hive/server/condenser_api/cursor.py b/hive/server/condenser_api/cursor.py
index f5cbaf87a..980fbc1c8 100644
--- a/hive/server/condenser_api/cursor.py
+++ b/hive/server/condenser_api/cursor.py
@@ -175,7 +175,7 @@ async def pids_by_query(db, sort, start_author, start_permlink, limit, tag):
                 where.append('category_id = (SELECT id FROM hive_category_data WHERE category = :tag)')
                 if sort in ('trending', 'hot'):
                     where.append('depth = 0')
-            sql = "SELECT post_id FROM hive_post_tags WHERE tag = :tag"
+            sql = "SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)"
             where.append("id IN (%s)" % sql)
 
     start_id = None
diff --git a/hive/server/condenser_api/methods.py b/hive/server/condenser_api/methods.py
index d6a104a38..00afdeba6 100644
--- a/hive/server/condenser_api/methods.py
+++ b/hive/server/condenser_api/methods.py
@@ -251,7 +251,7 @@ async def get_discussions_by(discussion_type, context, start_author: str = '',
         if tag[:5] == 'hive-':
             sql = sql % """ %s AND hp.category = :tag """
         else:
-            sql = sql % """ %s AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag = :tag) """
+            sql = sql % """ %s AND hp.post_id IN (SELECT post_id FROM hive_post_tags WHERE tag_id = (SELECT id FROM hive_tag_data WHERE tag = :tag)) """
 
     if start_author and start_permlink:
         if discussion_type == 'trending':
-- 
GitLab