diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp index 9b13fd63..adb97a2e 100644 --- a/mssql_python/pybind/ddbc_bindings.cpp +++ b/mssql_python/pybind/ddbc_bindings.cpp @@ -4780,9 +4780,8 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, ColumnBuffers buffers(numCols, fetchSize); if (!hasLobColumns && fetchSize > 0) { - // Bind columns — Arrow always uses SQL_C_CHAR for VARCHAR because - // it processes raw byte buffers directly, not via Python codecs. - ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_CHAR); + // Bind VARCHAR columns as SQL_C_WCHAR as well, so we never have to decode CHAR encodings here + ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_WCHAR); if (!SQL_SUCCEEDED(ret)) { LOG("Error when binding columns"); return ret; @@ -4841,16 +4840,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, } case SQL_CHAR: case SQL_VARCHAR: - case SQL_LONGVARCHAR: { - ret = GetDataVar(hStmt, idxCol + 1, SQL_C_CHAR, - buffers.charBuffers[idxCol], - buffers.indicators[idxCol].data()); - if (!SQL_SUCCEEDED(ret)) { - LOG("Error fetching CHAR LOB for column %d", idxCol + 1); - return ret; - } - break; - } + case SQL_LONGVARCHAR: case SQL_SS_XML: case SQL_WCHAR: case SQL_WVARCHAR: @@ -5093,24 +5083,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, } case SQL_CHAR: case SQL_VARCHAR: - case SQL_LONGVARCHAR: { -#if defined(__APPLE__) || defined(__linux__) - uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/; -#else - uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/; -#endif - auto target_vec = &arrowColumnProducer->varData; - auto start = arrowColumnProducer->varVal[idxRowArrow]; - while (target_vec->size() < start + dataLen) { - target_vec->resize(target_vec->size() * 2); - } - - std::memcpy(&(*target_vec)[start], - &buffers.charBuffers[idxCol][idxRowSql * fetchBufferSize], 
dataLen); - arrowColumnProducer->varVal[idxRowArrow + 1] = start + dataLen; - break; - } + case SQL_LONGVARCHAR: case SQL_SS_XML: case SQL_WCHAR: case SQL_WVARCHAR: diff --git a/tests/test_004_cursor_arrow.py b/tests/test_004_cursor_arrow.py index ce6163f6..731721e8 100644 --- a/tests/test_004_cursor_arrow.py +++ b/tests/test_004_cursor_arrow.py @@ -313,6 +313,63 @@ def test_arrow_long_string(cursor: mssql_python.Cursor): assert batch.column(0).to_pylist() == [long_string] +def test_arrow_varchar_utf8_collation_unicode(cursor: mssql_python.Cursor): + table = "#t_arrow_utf8_varchar" + collation = "Latin1_General_100_CI_AS_SC_UTF8" + expected = [ + "Grüße", + "你好😀", + "こんにちは", + "Привет", + "Hello 世界", + "😀😃😄😁", + "", + None, + ] + + try: + cursor.execute( + f"create table {table} (id int primary key, v varchar(32) collate {collation})" + ) + except Exception as exc: + pytest.skip(f"UTF-8 collation '{collation}' not supported: {exc}") + + try: + for index, value in enumerate(expected, start=1): + cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value) + tbl = cursor.execute(f"select v from {table} order by id").arrow() + assert tbl.column(0).type.equals(pa.large_string()) + assert tbl.column(0).to_pylist() == expected + finally: + cursor.execute(f"drop table if exists {table}") + + +def test_arrow_varchar_utf8_collation_cp1252(cursor: mssql_python.Cursor): + table = "#t_arrow_cp1252_varchar" + collation = "SQL_Latin1_General_CP1_CI_AS" + expected = [ + "Grüße", + "café René!", + "naïve café", + "Español", + "Müller-Öztürk", + "Françoise", + "", + None, + ] + + cursor.execute(f"create table {table} (id int primary key, v varchar(32) collate {collation})") + + try: + for index, value in enumerate(expected, start=1): + cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value) + tbl = cursor.execute(f"select v from {table} order by id").arrow() + assert tbl.column(0).type.equals(pa.large_string()) + assert tbl.column(0).to_pylist() == 
expected + finally: + cursor.execute(f"drop table if exists {table}") + + def test_rownumber_arrow_batch_interleaved_fetchmany(cursor: mssql_python.Cursor): """Verify that arrow_batch and fetchmany can be interleaved on the same result set with correct rownumber tracking and values."""