From 65138c5cd99a4f55a092a74014e48ce76f81ff27 Mon Sep 17 00:00:00 2001 From: ABW <andrzejl@syncad.com> Date: Fri, 4 Sep 2020 19:54:19 +0200 Subject: [PATCH] [ABW]: some characters apparently need long 8-byte encoding (problem actually caused by use of ujson) --- hive/utils/normalize.py | 26 +++++++++++++++----------- tests/tests_api | 2 +- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/hive/utils/normalize.py b/hive/utils/normalize.py index 0894cd1df..0fe4f5d13 100644 --- a/hive/utils/normalize.py +++ b/hive/utils/normalize.py @@ -78,19 +78,23 @@ def escape_characters(text): if ch in SPECIAL_CHARS: dw = SPECIAL_CHARS[ch] ret = ret + dw - elif ch.isprintable(): - ret = ret + ch else: - # escaped_value = ch.encode('unicode-escape').decode('utf-8') ordinal = ord(ch) - hexstr = hex(ordinal)[2:] - escaped_value = '\\u' - i = len(hexstr) - while i < 4: - escaped_value += '0' - i += 1 - escaped_value += hexstr - ret = ret + escaped_value + if ordinal <= 0x80 and ch.isprintable(): + ret = ret + ch + else: + hexstr = hex(ordinal)[2:] + i = len(hexstr) + max = 4 + escaped_value = '\\u' + if i > max: + max = 8 + escaped_value = '\\U' + while i < max: + escaped_value += '0' + i += 1 + escaped_value += hexstr + ret = ret + escaped_value ret = ret + "'" return ret diff --git a/tests/tests_api b/tests/tests_api index 4ee51004b..fa660ef0e 160000 --- a/tests/tests_api +++ b/tests/tests_api @@ -1 +1 @@ -Subproject commit 4ee51004b4d83d2c12ca8f6e10faab762cc0262f +Subproject commit fa660ef0ee019ba9c2da91e7a9140423593d944e -- GitLab