1818#include "mc-text-search-str-encode-private.h"
1919#include "mongocrypt-buffer-private.h"
2020#include "mongocrypt.h"
21+ #include "unicode/fold.h"
2122#include <bson/bson.h>
2223#include <stdint.h>
2324
@@ -170,23 +171,47 @@ static uint32_t mc_get_utf8_codepoint_length(const char *buf, uint32_t len) {
170171 return codepoint_len ;
171172}
172173
173- // TODO MONGOCRYPT-759 This helper only exists to test folded len != unfolded len; make the test actually use folding
174- mc_str_encode_sets_t * mc_text_search_str_encode_helper (const mc_FLE2TextSearchInsertSpec_t * spec ,
175- uint32_t unfolded_codepoint_len ,
176- mongocrypt_status_t * status ) {
174+ mc_str_encode_sets_t * mc_text_search_str_encode (const mc_FLE2TextSearchInsertSpec_t * spec ,
175+ mongocrypt_status_t * status ) {
177176 BSON_ASSERT_PARAM (spec );
177+ if (spec -> len > MAX_ENCODE_BYTE_LEN ) {
178+ CLIENT_ERR ("StrEncode: String passed in was too long: String was %u bytes, but max is %u bytes" ,
179+ spec -> len ,
180+ MAX_ENCODE_BYTE_LEN );
181+ return NULL ;
182+ }
178183
179184 if (!bson_utf8_validate (spec -> v , spec -> len , false /* allow_null */ )) {
180185 CLIENT_ERR ("StrEncode: String passed in was not valid UTF-8" );
181186 return NULL ;
182187 }
188+ uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length (spec -> v , spec -> len );
189+ if (unfolded_codepoint_len == 0 ) {
190+ // Empty string: We set unfolded length to 1 so that we generate fake tokens.
191+ unfolded_codepoint_len = 1 ;
192+ }
183193
184- const char * folded_str = spec -> v ;
185- uint32_t folded_str_bytes_len = spec -> len ;
194+ mc_utf8_string_with_bad_char_t * base_string ;
195+ if (spec -> casef || spec -> diacf ) {
196+ char * folded_str ;
197+ size_t folded_str_bytes_len ;
198+ if (!unicode_fold (spec -> v ,
199+ spec -> len ,
200+ (spec -> casef * kUnicodeFoldToLower ) | (spec -> diacf * kUnicodeFoldRemoveDiacritics ),
201+ & folded_str ,
202+ & folded_str_bytes_len ,
203+ status )) {
204+ return NULL ;
205+ }
206+ base_string = mc_utf8_string_with_bad_char_from_buffer (folded_str , (uint32_t )folded_str_bytes_len );
207+ bson_free (folded_str );
208+ } else {
209+ base_string = mc_utf8_string_with_bad_char_from_buffer (spec -> v , spec -> len );
210+ }
186211
187212 mc_str_encode_sets_t * sets = bson_malloc0 (sizeof (mc_str_encode_sets_t ));
188213 // Base string is the folded string plus the 0xFF character
189- sets -> base_string = mc_utf8_string_with_bad_char_from_buffer ( folded_str , folded_str_bytes_len ) ;
214+ sets -> base_string = base_string ;
190215 if (spec -> suffix .set ) {
191216 sets -> suffix_set = generate_suffix_tree (sets -> base_string , unfolded_codepoint_len , & spec -> suffix .value );
192217 }
@@ -204,33 +229,11 @@ mc_str_encode_sets_t *mc_text_search_str_encode_helper(const mc_FLE2TextSearchIn
204229 }
205230 sets -> substring_set = generate_substring_tree (sets -> base_string , unfolded_codepoint_len , & spec -> substr .value );
206231 }
207- // Exact string is always the first len characters of the base string
208- _mongocrypt_buffer_from_data (& sets -> exact , sets -> base_string -> buf .data , folded_str_bytes_len );
232+ // Exact string is always equal to the base string up until the bad character
233+ _mongocrypt_buffer_from_data (& sets -> exact , sets -> base_string -> buf .data , ( uint32_t ) sets -> base_string -> buf . len - 1 );
209234 return sets ;
210235}
211236
212- mc_str_encode_sets_t * mc_text_search_str_encode (const mc_FLE2TextSearchInsertSpec_t * spec ,
213- mongocrypt_status_t * status ) {
214- BSON_ASSERT_PARAM (spec );
215- if (spec -> len > MAX_ENCODE_BYTE_LEN ) {
216- CLIENT_ERR ("StrEncode: String passed in was too long: String was %u bytes, but max is %u bytes" ,
217- spec -> len ,
218- MAX_ENCODE_BYTE_LEN );
219- return NULL ;
220- }
221- // TODO MONGOCRYPT-759 Implement and use CFold
222- if (!bson_utf8_validate (spec -> v , spec -> len , false /* allow_null */ )) {
223- CLIENT_ERR ("StrEncode: String passed in was not valid UTF-8" );
224- return NULL ;
225- }
226- uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length (spec -> v , spec -> len );
227- if (unfolded_codepoint_len == 0 ) {
228- // Empty string: We set unfolded length to 1 so that we generate fake tokens.
229- unfolded_codepoint_len = 1 ;
230- }
231- return mc_text_search_str_encode_helper (spec , unfolded_codepoint_len , status );
232- }
233-
234237void mc_str_encode_sets_destroy (mc_str_encode_sets_t * sets ) {
235238 if (!sets ) {
236239 return ;
0 commit comments