diff --git a/.gitlab-ci.yaml b/.gitlab-ci.yaml index cd2caefc925aee5da88b11f7c88fbbef180f9dbb..84a907db34ecf1fe9a49b5904082b2457b70eaa8 100644 --- a/.gitlab-ci.yaml +++ b/.gitlab-ci.yaml @@ -84,6 +84,9 @@ hivemind_stop_server: script: - scripts/ci_stop_server.sh hive_server.pid + tags: + - hivemind + hivemind_start_server: stage: deploy environment: @@ -126,7 +129,7 @@ hivemind_start_api_smoketest: - python3 -m site --user-site - pip3 install --user --upgrade pip setuptools - pip3 install --user pyresttest - - scripts/ci_start_api_smoketest.sh "http://localhost/$HIVEMIND_HTTP_PORT" + - scripts/ci_start_api_smoketest.sh "http://localhost:$HIVEMIND_HTTP_PORT" when: manual diff --git a/Pipfile b/Pipfile index f779e7286eeae7628fcff358affd95068088980e..a69a9d5b3ec308e68a967aae10b8b1d251e7a7eb 100644 --- a/Pipfile +++ b/Pipfile @@ -15,7 +15,7 @@ jsonrpcserver = "4.1.3+8f3437a" aiohttp = "*" aiopg = "*" "psycopg2-binary" = "*" - +"diff-match-patch" = "*" [dev-packages] diff --git a/hive/db/schema.py b/hive/db/schema.py index 0b733eae5873d68f50e6403ab1faf279489b701b..f0df0af90e65d2faa36d483a9ebb3b6504ab713d 100644 --- a/hive/db/schema.py +++ b/hive/db/schema.py @@ -424,7 +424,7 @@ def setup(db): in _parent_permlink hive_permlink_data.permlink%TYPE, in _date hive_posts.created_at%TYPE, in _community_support_start_date hive_posts.created_at%TYPE) - RETURNS TABLE (id hive_posts.id%TYPE, author_id hive_posts.author_id%TYPE, permlink_id hive_posts.permlink_id%TYPE, + RETURNS TABLE (is_new_post boolean, id hive_posts.id%TYPE, author_id hive_posts.author_id%TYPE, permlink_id hive_posts.permlink_id%TYPE, post_category hive_category_data.category%TYPE, parent_id hive_posts.parent_id%TYPE, community_id hive_posts.community_id%TYPE, is_valid hive_posts.is_valid%TYPE, is_muted hive_posts.is_muted%TYPE, depth hive_posts.depth%TYPE, is_edited boolean) @@ -487,7 +487,7 @@ def setup(db): END ) - RETURNING hp.id, hp.author_id, hp.permlink_id, (SELECT hcd.category FROM hive_category_data 
hcd WHERE hcd.id = hp.category_id) as post_category, hp.parent_id, hp.community_id, hp.is_valid, hp.is_muted, hp.depth, (hp.updated_at > hp.created_at) as is_edited + RETURNING (xmax = 0) as is_new_post, hp.id, hp.author_id, hp.permlink_id, (SELECT hcd.category FROM hive_category_data hcd WHERE hcd.id = hp.category_id) as post_category, hp.parent_id, hp.community_id, hp.is_valid, hp.is_muted, hp.depth, (hp.updated_at > hp.created_at) as is_edited ; ELSE INSERT INTO hive_category_data @@ -536,7 +536,7 @@ def setup(db): END ) - RETURNING hp.id, hp.author_id, hp.permlink_id, _parent_permlink as post_category, hp.parent_id, hp.community_id, hp.is_valid, hp.is_muted, hp.depth, (hp.updated_at > hp.created_at) as is_edited + RETURNING (xmax = 0) as is_new_post, hp.id, hp.author_id, hp.permlink_id, _parent_permlink as post_category, hp.parent_id, hp.community_id, hp.is_valid, hp.is_muted, hp.depth, (hp.updated_at > hp.created_at) as is_edited ; END IF; END diff --git a/hive/indexer/post_data_cache.py b/hive/indexer/post_data_cache.py index 1459f900dd6f63b8c4b8a29f28e6d8b7d5d1d7c2..8c578b60e5daf93bd706876fa56e2c4bee382a37 100644 --- a/hive/indexer/post_data_cache.py +++ b/hive/indexer/post_data_cache.py @@ -15,12 +15,25 @@ class PostDataCache(object): return pid in cls._data @classmethod - def add_data(cls, pid, post_data): + def add_data(cls, pid, post_data, print_query = False): """ Add data to cache """ cls._data[pid] = post_data @classmethod - def flush(cls): + def get_post_body(cls, pid): + """ Returns body of given post from collected cache or from underlying DB storage. 
""" + try: + post_data = cls._data[pid] + except KeyError: + sql = """ + SELECT hpd.body FROM hive_post_data hpd WHERE hpd.id = :post_id; + """ + row = DB.query_row(sql, post_id = pid) + post_data = dict(row) + return post_data['body'] + + @classmethod + def flush(cls, print_query = False): """ Flush data from cache to db """ if cls._data: sql = """ @@ -49,5 +62,9 @@ class PostDataCache(object): WHERE hive_post_data.id = EXCLUDED.id """ + + if(print_query): + log.info("Executing query:\n{}".format(sql)) + DB.query(sql) cls._data.clear() diff --git a/hive/indexer/posts.py b/hive/indexer/posts.py index 6fae17d4bf2b5ce0279b327389f432df916cbefd..89d3e80fb726f0b4aa5fbeabb68bcc411c05ae96 100644 --- a/hive/indexer/posts.py +++ b/hive/indexer/posts.py @@ -5,6 +5,8 @@ import collections from json import dumps, loads +from diff_match_patch import diff_match_patch + from hive.db.adapter import Db from hive.db.db_state import DbState @@ -94,7 +96,7 @@ class Posts: """Register new/edited/undeleted posts; insert into feed cache.""" sql = """ - SELECT id, author_id, permlink_id, post_category, parent_id, community_id, is_valid, is_muted, depth, is_edited + SELECT is_new_post, id, author_id, permlink_id, post_category, parent_id, community_id, is_valid, is_muted, depth, is_edited FROM process_hive_post_operation((:author)::varchar, (:permlink)::varchar, (:parent_author)::varchar, (:parent_permlink)::varchar, (:date)::timestamp, (:community_support_start_date)::timestamp); """ @@ -108,11 +110,22 @@ class Posts: cls._set_id(op['author']+'/'+op['permlink'], result['id']) - # add content data to hive_post_data - post_data = dict(title=op['title'], preview=op['preview'] if 'preview' in op else "", - img_url=op['img_url'] if 'img_url' in op else "", body=op['body'], - json=op['json_metadata'] if op['json_metadata'] else '{}') - PostDataCache.add_data(result['id'], post_data) + if result['is_new_post']: + # add content data to hive_post_data + post_data = dict(title=op['title'], 
preview=op['preview'] if 'preview' in op else "", + img_url=op['img_url'] if 'img_url' in op else "", body=op['body'], + json=op['json_metadata'] if op['json_metadata'] else '{}') + else: + # edit case. Now we need to (potentially) apply patch to the post body. + new_body = cls._merge_post_body(id=result['id'], new_body_def=op['body']) + post_data = dict(title=op['title'], preview=op['preview'] if 'preview' in op else "", + img_url=op['img_url'] if 'img_url' in op else "", body=new_body, + json=op['json_metadata'] if op['json_metadata'] else '{}') + +# log.info("Adding author: {} permlink: {}".format(op['author'], op['permlink'])) + + printQuery = False # op['author'] == 'xeroc' and op['permlink'] == 're-piston-20160818t080811' + PostDataCache.add_data(result['id'], post_data, printQuery) md = {} # At least one case where jsonMetadata was double-encoded: condenser#895 @@ -402,3 +415,30 @@ class Posts: is_muted = True return error + @classmethod + def _merge_post_body(cls, id, new_body_def): + new_body = '' + old_body = '' + + try: + dmp = diff_match_patch() + patch = dmp.patch_fromText(new_body_def) + if patch is not None and len(patch): + old_body = PostDataCache.get_post_body(id) + new_body, _ = dmp.patch_apply(patch, old_body) + #new_utf8_body = new_body.decode('utf-8') + #new_body = new_utf8_body + else: + new_body = new_body_def + except ValueError as e: +# log.info("Merging a body post id: {} caused an ValueError exception {}".format(id, e)) +# log.info("New body definition: {}".format(new_body_def)) +# log.info("Old body definition: {}".format(old_body)) + new_body = new_body_def + except Exception as ex: + log.info("Merging a body post id: {} caused an unknown exception {}".format(id, ex)) + log.info("New body definition: {}".format(new_body_def)) + log.info("Old body definition: {}".format(old_body)) + new_body = new_body_def + + return new_body diff --git a/hive/utils/normalize.py b/hive/utils/normalize.py index 
ec87dd95d8b54ad81099ad705c2005b259d45388..3de9a6e8e4cb3783c70e3cae7072581af1639d57 100644 --- a/hive/utils/normalize.py +++ b/hive/utils/normalize.py @@ -20,6 +20,10 @@ dct={'0':'a','1':'b','2':'c','3':'d','4':'e', # convert special chars into their octal formats recognized by sql special_chars={ + "\r":"\\015", + "\n":"\\012", + "\v":"\\013", + "\f": "\\014", "\\":"\\134", "'":"\\047", "%":"\\045", @@ -35,11 +39,21 @@ def escape_characters(text): ret = "E'" for ch in text: - try: - dw=special_chars[ch] - ret=ret+dw - except KeyError as k: - ret=ret+ch + if ch.isprintable() or ch in special_chars: + try: + dw=special_chars[ch] + ret=ret+dw + except KeyError as k: + ret=ret+ch + else: + ordinal = ord(ch) + if ordinal == 0 or ordinal >= 0x80: + escaped_value = 'u' + hex(ordinal)[2:] +# logging.info("Encoded unicode escape: {}".format(escaped_value)) + else: + escaped_value = ch.encode('unicode-escape').decode('utf-8') + + ret = ret + escaped_value ret = ret + "'" return ret diff --git a/requirements.txt b/requirements.txt index 1234f7b2855f4621b35046ecd2ab119ac95b6a61..be59f36825b5c919a812beb09ef48df0bd4384e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,3 +39,4 @@ tzlocal==2.0.0 ujson==2.0.3 urllib3==1.25.9 yarl==1.4.2 +diff-match-patch==20200713 diff --git a/setup.py b/setup.py index 0ed03896f514b29bea4017ca22332127f6e6274b..7ade2c64711c120aa1fb42e054df1446ad373ce8 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ setup( 'aiocache', 'configargparse', 'pdoc', + 'diff-match-patch' ], extras_require={'test': tests_require}, entry_points={