Skip to content

Commit fdff93f

Browse files
committed
When converting Python strings to ICU strings, replace invalid UTF-16 surrogate code points with the replacement character (U+FFFD). Needed because various ICU functions fail if invalid surrogates are present. Fixes #1713892 [calibredb add cannot avoid duplicates](https://bugs.launchpad.net/calibre/+bug/1713892)
1 parent 684bbe6 commit fdff93f

File tree

2 files changed

+20
-9
lines changed

2 files changed

+20
-9
lines changed

src/calibre/utils/icu_calibre_utils.h

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@
2222
#include <unicode/unorm.h>
2323
#include <unicode/ubrk.h>
2424

25-
#if PY_VERSION_HEX >= 0x03030000
25+
#if PY_VERSION_HEX >= 0x03030000
2626
#error Not implemented for python >= 3.3
2727
#endif
2828

2929
// Smaller of x and y. The whole expansion is parenthesized so the macro composes
// correctly inside larger expressions (e.g. 2 * MIN(a, b)). Each argument may be
// evaluated twice, so do not pass expressions with side effects.
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
30+
// True when x is a UTF-16 high (lead) surrogate code unit, i.e. in 0xD800..0xDBFF.
// The argument is parenthesized so compound expressions are handled correctly;
// it is evaluated twice, so do not pass expressions with side effects.
#define IS_HIGH_SURROGATE(x) (0xd800 <= (x) && (x) <= 0xdbff)
31+
// True when x is a UTF-16 low (trail) surrogate code unit, i.e. in 0xDC00..0xDFFF.
// The argument is parenthesized so compound expressions are handled correctly;
// it is evaluated twice, so do not pass expressions with side effects.
#define IS_LOW_SURROGATE(x) (0xdc00 <= (x) && (x) <= 0xdfff)
3032

3133
// Roundtripping will need to be implemented differently for python 3.3+ where strings are stored with variable widths
3234

@@ -48,15 +50,24 @@ static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) {
4850
sz = PyUnicode_GET_SIZE(obj);
4951
ans = (UChar*) calloc(2*(sz+1), sizeof(UChar)); // There can be no more than 2 UChars per character + ensure null termination
5052
if (ans == NULL) { PyErr_NoMemory(); goto end; }
51-
u_strFromUTF32(ans, (int32_t)(2*(sz+1)), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status);
53+
u_strFromUTF32WithSub(ans, (int32_t)(2*(sz+1)), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, 0xfffd, NULL, &status);
5254
if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; }
5355
#else
5456
// narrow build (UTF-16)
55-
sz = PyUnicode_GET_DATA_SIZE(obj);
56-
ans = (UChar*) calloc(sz+2, 1); // Ensure null termination
57+
sz = PyUnicode_GET_SIZE(obj);
58+
ans = (UChar*) malloc((sz + 1) * sizeof(UChar));
5759
if (ans == NULL) { PyErr_NoMemory(); goto end; }
58-
memcpy(ans, PyUnicode_AS_UNICODE(obj), sz);
59-
if (osz != NULL) *osz = (int32_t)PyUnicode_GET_SIZE(obj);
60+
for (Py_ssize_t i = 0; i < sz; i++) {
61+
UChar ch = PyUnicode_AS_UNICODE(obj)[i];
62+
if (IS_HIGH_SURROGATE(ch) {
63+
if (i >= sz - 1 || !IS_LOW_SURROGATE(PyUnicode_AS_UNICODE(obj)[i+1])) ans[i] = 0xfffd;
64+
else { ans[i] = ch; ans[i+1] = PyUnicode_AS_UNICODE(obj)[i+1]; i++; }
65+
} else if (IS_LOW_SURROGATE(ch)) {
66+
ans[i] = 0xfffd;
67+
} else ans[i] = ch;
68+
}
69+
ans[sz] = 0; // Ensure null termination
70+
if (osz != NULL) *osz = (int32_t)sz;
6071
#endif
6172
end:
6273
return ans;
@@ -104,5 +115,3 @@ static PyObject* icu_to_python(UChar *src, int32_t sz) {
104115
#endif
105116
}
106117
#endif
107-
108-

src/calibre/utils/icu_test.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ def test_roundtrip(self):
133133
' Test roundtripping '
134134
for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'):
135135
self.ae(r, icu._icu.roundtrip(r))
136+
self.ae(icu._icu.roundtrip('\ud8e81'), '\ufffd1')
137+
self.ae(icu._icu.roundtrip('\udc01\ud8e8'), '\ufffd\ufffd')
136138
for x, l in [('', 0), ('a', 1), ('\U0001f431', 1)]:
137139
self.ae(icu._icu.string_length(x), l)
138140
for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]:
@@ -218,6 +220,6 @@ def test_build():
218220
if not result.wasSuccessful():
219221
raise SystemExit(1)
220222

223+
221224
if __name__ == '__main__':
222225
run(verbosity=4)
223-

0 commit comments

Comments
 (0)