Merge #7892: Add full UTF-8 support to RPC

7982fce doc: Mention full UTF-8 support in release notes (Wladimir J. van der Laan) 6bbb4ef test: test utf-8 for labels in wallet (Wladimir J. van der Laan) a406fcb test: add ensure_ascii setting to AuthServiceProxy (Wladimir J. van der Laan) 60ab9b2 Squashed 'src/univalue/' changes from 2740c4f..f32df99 (Wladimir J. van der Laan)
2016-06-16 12:08:18 +02:00 · 2016-06-16 12:08:18 +02:00 · 9c3d0fab36
commit 9c3d0fab36
parent 3f89a534ac 7982fce64c
14 changed files with 209 additions and 44 deletions
--- a/doc/release-notes.md
+++ b/doc/release-notes.md
@ -43,6 +43,11 @@ RPC low-level changes
  32-bit and 64-bit platforms, and the txids were missing in the hashed data. This has been
  fixed, but this means that the output will be different than from previous versions.
 - Full UTF-8 support in the RPC API. Non-ASCII characters in, for example,
  wallet labels have always been malformed because they weren't taken into account
  properly in JSON RPC processing. This is no longer the case. This also affects
  the GUI debug console.
 C++11 and Python 3
 -------------------
--- a/qa/rpc-tests/test_framework/authproxy.py
+++ b/qa/rpc-tests/test_framework/authproxy.py
@ -67,9 +67,11 @@ def EncodeDecimal(o):
 class AuthServiceProxy(object):
    __id_count = 0
-    def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None):
+    # ensure_ascii: escape unicode as \uXXXX, passed to json.dumps
    def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None, ensure_ascii=True):
        self.__service_url = service_url
        self._service_name = service_name
        self.ensure_ascii = ensure_ascii # can be toggled on the fly by tests
        self.__url = urlparse.urlparse(service_url)
        if self.__url.port is None:
            port = 80
@ -134,12 +136,12 @@ class AuthServiceProxy(object):
        AuthServiceProxy.__id_count += 1
        log.debug("-%s-> %s %s"%(AuthServiceProxy.__id_count, self._service_name,
-                                 json.dumps(args, default=EncodeDecimal)))
+                                 json.dumps(args, default=EncodeDecimal, ensure_ascii=self.ensure_ascii)))
        postdata = json.dumps({'version': '1.1',
                               'method': self._service_name,
                               'params': args,
-                               'id': AuthServiceProxy.__id_count}, default=EncodeDecimal)
+                               'id': AuthServiceProxy.__id_count}, default=EncodeDecimal, ensure_ascii=self.ensure_ascii)
-        response = self._request('POST', self.__url.path, postdata)
+        response = self._request('POST', self.__url.path, postdata.encode('utf-8'))
        if response['error'] is not None:
            raise JSONRPCException(response['error'])
        elif 'result' not in response:
@ -149,9 +151,9 @@ class AuthServiceProxy(object):
            return response['result']
    def _batch(self, rpc_call_list):
-        postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal)
+        postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal, ensure_ascii=self.ensure_ascii)
        log.debug("--> "+postdata)
-        return self._request('POST', self.__url.path, postdata)
+        return self._request('POST', self.__url.path, postdata.encode('utf-8'))
    def _get_response(self):
        http_response = self.__conn.getresponse()
@ -167,7 +169,7 @@ class AuthServiceProxy(object):
        responsedata = http_response.read().decode('utf8')
        response = json.loads(responsedata, parse_float=decimal.Decimal)
        if "error" in response and response["error"] is None:
-            log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal)))
+            log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal, ensure_ascii=self.ensure_ascii)))
        else:
            log.debug("<-- "+responsedata)
        return response
--- a/qa/rpc-tests/wallet.py
+++ b/qa/rpc-tests/wallet.py
@ -309,6 +309,20 @@ class WalletTest (BitcoinTestFramework):
        balance_nodes = [self.nodes[i].getbalance() for i in range(3)]
        block_count = self.nodes[0].getblockcount()
        # Check modes:
        #   - True: unicode escaped as \u....
        #   - False: unicode directly as UTF-8
        for mode in [True, False]:
            self.nodes[0].ensure_ascii = mode
            # unicode check: Basic Multilingual Plane, Supplementary Plane respectively
            for s in [u'рыба', u'𝅘𝅥𝅯']:
                addr = self.nodes[0].getaccountaddress(s)
                label = self.nodes[0].getaccount(addr)
                assert_equal(label, s)
                assert(s in self.nodes[0].listaccounts().keys())
        self.nodes[0].ensure_ascii = True # restore to default
        # maintenance tests
        maintenance = [
            '-rescan',
            '-reindex',
--- a/src/univalue/Makefile.am
+++ b/src/univalue/Makefile.am
@ -3,7 +3,7 @@ ACLOCAL_AMFLAGS = -I build-aux/m4
 .INTERMEDIATE: $(GENBIN)
 include_HEADERS = include/univalue.h
-noinst_HEADERS = lib/univalue_escapes.h
+noinst_HEADERS = lib/univalue_escapes.h lib/univalue_utffilter.h
 lib_LTLIBRARIES = libunivalue.la
@ -73,6 +73,10 @@ TEST_FILES = \
 	$(TEST_DATA_DIR)/fail35.json \
 	$(TEST_DATA_DIR)/fail36.json \
 	$(TEST_DATA_DIR)/fail37.json \
 	$(TEST_DATA_DIR)/fail38.json \
 	$(TEST_DATA_DIR)/fail39.json \
 	$(TEST_DATA_DIR)/fail40.json \
 	$(TEST_DATA_DIR)/fail41.json \
 	$(TEST_DATA_DIR)/fail3.json \
 	$(TEST_DATA_DIR)/fail4.json \
 	$(TEST_DATA_DIR)/fail5.json \
@ -83,6 +87,7 @@ TEST_FILES = \
 	$(TEST_DATA_DIR)/pass1.json \
 	$(TEST_DATA_DIR)/pass2.json \
 	$(TEST_DATA_DIR)/pass3.json \
-	$(TEST_DATA_DIR)/round1.json
+	$(TEST_DATA_DIR)/round1.json \
 	$(TEST_DATA_DIR)/round2.json
 EXTRA_DIST=$(TEST_FILES) $(GEN_SRCS)
--- a/src/univalue/configure.ac
+++ b/src/univalue/configure.ac
@ -1,7 +1,7 @@
 m4_define([libunivalue_major_version], [1])
 m4_define([libunivalue_minor_version], [1])
-m4_define([libunivalue_micro_version], [1])
+m4_define([libunivalue_micro_version], [2])
-m4_define([libunivalue_interface_age], [1])
+m4_define([libunivalue_interface_age], [2])
 # If you need a modifier for the version number. 
 # Normally empty, but can be used to make "fixup" releases.
 m4_define([libunivalue_extraversion], [])
@ -14,7 +14,7 @@ m4_define([libunivalue_age], [m4_eval(libunivalue_binary_age - libunivalue_inter
 m4_define([libunivalue_version], [libunivalue_major_version().libunivalue_minor_version().libunivalue_micro_version()libunivalue_extraversion()])
-AC_INIT([univalue], [1.0.1],
+AC_INIT([univalue], [1.0.2],
        [http://github.com/jgarzik/univalue/])
 dnl make the compilation flags quiet unless V=1 is used
--- a/src/univalue/lib/univalue_read.cpp
+++ b/src/univalue/lib/univalue_read.cpp
@ -6,6 +6,7 @@
 #include <vector>
 #include <stdio.h>
 #include "univalue.h"
 #include "univalue_utffilter.h"
 using namespace std;
@ -174,41 +175,31 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
        raw++;                                // skip "
        string valStr;
        JSONUTF8StringFilter writer(valStr);
        while (*raw) {
-            if (*raw < 0x20)
+            if ((unsigned char)*raw < 0x20)
                return JTOK_ERR;
            else if (*raw == '\\') {
                raw++;                        // skip backslash
                switch (*raw) {
-                case '"':  valStr += "\""; break;
+                case '"':  writer.push_back('\"'); break;
-                case '\\': valStr += "\\"; break;
+                case '\\': writer.push_back('\\'); break;
-                case '/':  valStr += "/"; break;
+                case '/':  writer.push_back('/'); break;
-                case 'b':  valStr += "\b"; break;
+                case 'b':  writer.push_back('\b'); break;
-                case 'f':  valStr += "\f"; break;
+                case 'f':  writer.push_back('\f'); break;
-                case 'n':  valStr += "\n"; break;
+                case 'n':  writer.push_back('\n'); break;
-                case 'r':  valStr += "\r"; break;
+                case 'r':  writer.push_back('\r'); break;
-                case 't':  valStr += "\t"; break;
+                case 't':  writer.push_back('\t'); break;
                case 'u': {
                    unsigned int codepoint;
                    if (hatoui(raw + 1, raw + 1 + 4, codepoint) !=
                               raw + 1 + 4)
                        return JTOK_ERR;
-
+                    writer.push_back_u(codepoint);
                    if (codepoint <= 0x7f)
                        valStr.push_back((char)codepoint);
                    else if (codepoint <= 0x7FF) {
                        valStr.push_back((char)(0xC0 | (codepoint >> 6)));
                        valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
                    } else if (codepoint <= 0xFFFF) {
                        valStr.push_back((char)(0xE0 | (codepoint >> 12)));
                        valStr.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
                        valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
                    }
                    raw += 4;
                    break;
                    }
@ -226,11 +217,13 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
            }
            else {
-                valStr += *raw;
+                writer.push_back(*raw);
                raw++;
            }
        }
        if (!writer.finalize())
            return JTOK_ERR;
        tokenVal = valStr;
        consumed = (raw - rawStart);
        return JTOK_STRING;
--- a/src/univalue/lib/univalue_utffilter.h
+++ b/src/univalue/lib/univalue_utffilter.h
@ -0,0 +1,119 @@
 // Copyright 2016 Wladimir J. van der Laan
 // Distributed under the MIT software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
 #ifndef UNIVALUE_UTFFILTER_H
 #define UNIVALUE_UTFFILTER_H
 #include <string>
 /**
  * Filter that generates and validates UTF-8, as well as collates UTF-16
  * surrogate pairs as specified in RFC4627.
  *
  * Input arrives either as raw bytes (push_back) or as code points taken
  * from \uXXXX escapes (push_back_u). Accepted output is appended, encoded
  * as UTF-8, to the string passed to the constructor. Errors are sticky:
  * once a violation is seen, finalize() returns false.
  *
  * Note: the byte-level decoder does not reject overlong encodings or
  * 4-byte sequences that decode above U+10FFFF.
  */
 class JSONUTF8StringFilter
 {
 public:
    // s: output string; it is held by reference and appended to in place.
    JSONUTF8StringFilter(std::string &s):
        str(s), is_valid(true), codepoint(0), state(0), surpair(0)
    {
    }
    // Write single 8-bit char (may be part of UTF-8 sequence)
    void push_back(unsigned char ch)
    {
        if (state == 0) {
            if (ch < 0x80) { // 7-bit ASCII, direct pass-through
                // An open surrogate pair must be completed by its second
                // half; a plain character in between is an error, exactly
                // as it is for non-ASCII code points in push_back_u().
                if (surpair)
                    is_valid = false;
                else
                    str.push_back(ch);
            } else if (ch < 0xc0) // Mid-sequence character, invalid in this state
                is_valid = false;
            else if (ch < 0xe0) { // Start of 2-byte sequence
                codepoint = (ch & 0x1f) << 6;
                state = 6;
            } else if (ch < 0xf0) { // Start of 3-byte sequence
                codepoint = (ch & 0x0f) << 12;
                state = 12;
            } else if (ch < 0xf8) { // Start of 4-byte sequence
                codepoint = (ch & 0x07) << 18;
                state = 18;
            } else // Reserved, invalid
                is_valid = false;
        } else {
            if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
                is_valid = false;
            // Keep consuming even after an error so that state still
            // returns to 0; the result is discarded via is_valid anyway.
            state -= 6;
            codepoint |= (ch & 0x3f) << state;
            if (state == 0)
                push_back_u(codepoint);
        }
    }
    // Write codepoint directly, possibly collating surrogate pairs
    void push_back_u(unsigned int codepoint)
    {
        if (state) // Only accept full codepoints in open state
            is_valid = false;
        if (codepoint >= 0xD800 && codepoint < 0xDC00) { // First half of surrogate pair
            if (surpair) // Two subsequent surrogate pair openers - fail
                is_valid = false;
            else
                surpair = codepoint;
        } else if (codepoint >= 0xDC00 && codepoint < 0xE000) { // Second half of surrogate pair
            if (surpair) { // Open surrogate pair, expect second half
                // Compute code point from UTF-16 surrogate pair. The
                // 0x10000 offset must be ADDED: the shifted high half
                // reaches bit 16 for every plane above the first
                // supplementary one, so a bitwise OR would lose it
                // (e.g. D840 DC00 would yield U+10000, not U+20000).
                append_codepoint(0x10000 + ((surpair - 0xD800) << 10) + (codepoint - 0xDC00));
                surpair = 0;
            } else // Second half doesn't follow a first half - fail
                is_valid = false;
        } else {
            if (surpair) // First half of surrogate pair not followed by second - fail
                is_valid = false;
            else
                append_codepoint(codepoint);
        }
    }
    // Check that we're in a state where the string can be ended
    // No open sequences, no open surrogate pairs, etc
    // Returns true only if no error was recorded at any point.
    bool finalize()
    {
        if (state || surpair)
            is_valid = false;
        return is_valid;
    }
 private:
    std::string &str;   // Destination for the validated UTF-8 output
    bool is_valid;      // Cleared on the first violation, never set again
    // Current UTF-8 decoding state
    unsigned int codepoint; // Code point being assembled from raw bytes
    int state; // Bit position to be filled in by the next UTF-8 continuation byte, or 0
    // Surrogate state is tracked to handle this section of RFC4627:
    //
    //    To escape an extended character that is not in the Basic Multilingual
    //    Plane, the character is represented as a twelve-character sequence,
    //    encoding the UTF-16 surrogate pair.  So, for example, a string
    //    containing only the G clef character (U+1D11E) may be represented as
    //    "\uD834\uDD1E".
    //
    //  Two subsequent \u.... may have to be replaced with one actual codepoint.
    unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0
    // Append the UTF-8 encoding (1 to 4 bytes) of codepoint to the output.
    // Values above 0x1FFFFF match no branch and append nothing.
    void append_codepoint(unsigned int codepoint)
    {
        if (codepoint <= 0x7f)
            str.push_back((char)codepoint);
        else if (codepoint <= 0x7FF) {
            str.push_back((char)(0xC0 | (codepoint >> 6)));
            str.push_back((char)(0x80 | (codepoint & 0x3F)));
        } else if (codepoint <= 0xFFFF) {
            str.push_back((char)(0xE0 | (codepoint >> 12)));
            str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
            str.push_back((char)(0x80 | (codepoint & 0x3F)));
        } else if (codepoint <= 0x1FFFFF) {
            str.push_back((char)(0xF0 | (codepoint >> 18)));
            str.push_back((char)(0x80 | ((codepoint >> 12) & 0x3F)));
            str.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
            str.push_back((char)(0x80 | (codepoint & 0x3F)));
        }
    }
 };
 #endif
--- a/src/univalue/lib/univalue_write.cpp
+++ b/src/univalue/lib/univalue_write.cpp
@ -8,8 +8,6 @@
 #include "univalue.h"
 #include "univalue_escapes.h"
 // TODO: Using UTF8
 using namespace std;
 static string json_escape(const string& inS)
@ -23,15 +21,8 @@ static string json_escape(const string& inS)
        if (escStr)
            outS += escStr;
-
+        else
        else if (ch < 0x80)
            outS += ch;
        else { // TODO handle UTF-8 properly
            char tmpesc[16];
            sprintf(tmpesc, "\\u%04x", ch);
            outS += tmpesc;
        }
    }
    return outS;
--- a/src/univalue/test/fail38.json
+++ b/src/univalue/test/fail38.json
@ -0,0 +1 @@
 ["\ud834"]
--- a/src/univalue/test/fail39.json
+++ b/src/univalue/test/fail39.json
@ -0,0 +1 @@
 ["\udd61"]
--- a/src/univalue/test/fail40.json
+++ b/src/univalue/test/fail40.json
@ -0,0 +1 @@
 ["揣｡"]
--- a/src/univalue/test/fail41.json
+++ b/src/univalue/test/fail41.json
@ -0,0 +1 @@
 ["<22><><EFBFBD>"]
--- a/src/univalue/test/round2.json
+++ b/src/univalue/test/round2.json
@ -0,0 +1 @@
 ["a§■𐎒𝅘𝅥𝅯"]
--- a/src/univalue/test/unitester.cpp
+++ b/src/univalue/test/unitester.cpp
@ -22,6 +22,7 @@ string srcdir(JSON_TEST_SRC);
 static bool test_failed = false;
 #define d_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", filename.c_str()); } }
 #define f_assert(expr) { if (!(expr)) { test_failed = true; fprintf(stderr, "%s failed\n", __func__); } }
 static std::string rtrim(std::string s)
 {
@ -108,6 +109,10 @@ static const char *filenames[] = {
        "fail35.json",
        "fail36.json",
        "fail37.json",
        "fail38.json",               // invalid unicode: only first half of surrogate pair
        "fail39.json",               // invalid unicode: only second half of surrogate pair
        "fail40.json",               // invalid unicode: broken UTF-8
        "fail41.json",               // invalid unicode: unfinished UTF-8
        "fail3.json",
        "fail4.json",                // extra comma
        "fail5.json",
@ -119,14 +124,40 @@ static const char *filenames[] = {
        "pass2.json",
        "pass3.json",
        "round1.json",              // round-trip test
        "round2.json",              // unicode
 };
 // Exercise \uXXXX unescaping: each JSON document below is a one-element
 // array whose string must decode to the listed UTF-8 byte sequence.
 void unescape_unicode_test()
 {
    static const struct {
        const char *json;   // document handed to UniValue::read
        const char *utf8;   // expected decoded contents of element 0
    } cases[] = {
        // Escaped ASCII (quote)
        { "[\"\\u0022\"]", "\"" },
        // Escaped Basic Plane character, two-byte UTF-8
        { "[\"\\u0191\"]", "\xc6\x91" },
        // Escaped Basic Plane character, three-byte UTF-8
        { "[\"\\u2191\"]", "\xe2\x86\x91" },
        // Escaped Supplementary Plane character U+1d161
        { "[\"\\ud834\\udd61\"]", "\xf0\x9d\x85\xa1" },
    };

    UniValue val;
    for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        bool testResult = val.read(cases[i].json);
        f_assert(testResult);
        f_assert(val[0].get_str() == cases[i].utf8);
    }
 }
 // Test driver: runs every listed JSON fixture, then the \u unescape
 // checks. The exit status is 1 if any assertion set test_failed, else 0.
 int main (int argc, char *argv[])
 {
    const unsigned int total = ARRAY_SIZE(filenames);
    unsigned int fidx = 0;
    while (fidx < total) {
        runtest_file(filenames[fidx]);
        ++fidx;
    }

    unescape_unicode_test();

    if (test_failed)
        return 1;
    return 0;
 }