From 8795479dbc68e80b26b61e1379165b856ef1bfc2 Mon Sep 17 00:00:00 2001 From: ABW <andrzejl@syncad.com> Date: Thu, 3 Sep 2020 00:42:32 +0200 Subject: [PATCH] [ABW]: fixed unicode escaping --- hive/utils/normalize.py | 26 ++++++++++++++------------ tests/tests_api | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/hive/utils/normalize.py b/hive/utils/normalize.py index b435cc0bd..0894cd1df 100644 --- a/hive/utils/normalize.py +++ b/hive/utils/normalize.py @@ -29,6 +29,7 @@ UNIT_NAI = { # convert special chars into their octal formats recognized by sql SPECIAL_CHARS = { + "\x00" : " ", # nul char cannot be stored in string column (ABW: if we ever find the need to store nul chars we'll need bytea, not text) "\r" : "\\015", "\n" : "\\012", "\v" : "\\013", @@ -74,20 +75,21 @@ def escape_characters(text): ret = "E'" for ch in text: - if ch.isprintable() or ch in SPECIAL_CHARS: - try: - dw = SPECIAL_CHARS[ch] - ret = ret + dw - except KeyError: - ret = ret + ch + if ch in SPECIAL_CHARS: + dw = SPECIAL_CHARS[ch] + ret = ret + dw + elif ch.isprintable(): + ret = ret + ch else: + # escaped_value = ch.encode('unicode-escape').decode('utf-8') ordinal = ord(ch) - if ordinal == 0 or ordinal >= 0x80: - escaped_value = 'u' + hex(ordinal)[2:] -# logging.info("Encoded unicode escape: {}".format(escaped_value)) - else: - escaped_value = ch.encode('unicode-escape').decode('utf-8') - + hexstr = hex(ordinal)[2:] + escaped_value = '\\u' + i = len(hexstr) + while i < 4: + escaped_value += '0' + i += 1 + escaped_value += hexstr ret = ret + escaped_value ret = ret + "'" diff --git a/tests/tests_api b/tests/tests_api index c673b555a..7d925b4e8 160000 --- a/tests/tests_api +++ b/tests/tests_api @@ -1 +1 @@ -Subproject commit c673b555aa055358e0f5a1e1401a4110f7f83ca3 +Subproject commit 7d925b4e88faafd6d0154725bfe26d7bdfaee23f -- GitLab