#include "moar.h"

/* Byte order marks as they appear on the wire for each UTF-16 flavour. */
static MVMuint8 BOM_UTF16LE[2] = { 0xFF, 0xFE };
static MVMuint8 BOM_UTF16BE[2] = { 0xFE, 0xFF };

/* Endianness selectors. BIG/LITTLE are also the values stored in the
 * decodestream's decoder_state; AUTO is only ever passed as a request. */
#define UTF16_DECODE_BIG_ENDIAN 1
#define UTF16_DECODE_LITTLE_ENDIAN 2
#define UTF16_DECODE_AUTO_ENDIAN 4

/* True if the two bytes at buf8 are the little endian BOM (FF FE). The
 * caller must guarantee at least two readable bytes. */
MVM_STATIC_INLINE int has_little_endian_bom(const MVMuint8 *buf8) {
    return buf8[0] == BOM_UTF16LE[0] && buf8[1] == BOM_UTF16LE[1];
}

/* True if the two bytes at buf8 are the big endian BOM (FE FF). The
 * caller must guarantee at least two readable bytes. */
MVM_STATIC_INLINE int has_big_endian_bom(const MVMuint8 *buf8) {
    return buf8[0] == BOM_UTF16BE[0] && buf8[1] == BOM_UTF16BE[1];
}

/* Stores `setting` in the stream's decoder state, allocating the state
 * (a single MVMint32) on first use. */
MVM_STATIC_INLINE void init_utf16_decoder_state(MVMDecodeStream *ds, int setting) {
    if (!ds->decoder_state) {
        ds->decoder_state = MVM_malloc(sizeof(MVMint32));
    }
    *((MVMint32*)ds->decoder_state) = setting;
}

/* Lvalue access to the endianness stored in the stream's decoder state.
 * Only valid once init_utf16_decoder_state has run for this stream. */
#define utf16_decoder_state(ds) (*((MVMint32*)(ds)->decoder_state))

MVMuint32 MVM_string_utf16_decodestream_main(MVMThreadContext *tc, MVMDecodeStream *ds,
                                             const MVMuint32 *stopper_chars,
                                             MVMDecodeStreamSeparators *seps,
                                             int endianess);

/* Decodestream entry point for plain 'utf16'. The state starts out as the
 * host byte order and is only initialized once, so an endianness picked up
 * from a BOM on an earlier call is kept for later calls. */
MVMuint32 MVM_string_utf16_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                        const MVMuint32 *stopper_chars,
                                        MVMDecodeStreamSeparators *seps) {
    if (!ds->decoder_state) {
#ifdef MVM_BIGENDIAN
        init_utf16_decoder_state(ds, UTF16_DECODE_BIG_ENDIAN);
#else
        init_utf16_decoder_state(ds, UTF16_DECODE_LITTLE_ENDIAN);
#endif
    }
    return MVM_string_utf16_decodestream_main(tc, ds, stopper_chars, seps,
        UTF16_DECODE_AUTO_ENDIAN);
}

/* Decodestream entry point for 'utf16le'; forces little endian on every call. */
MVMuint32 MVM_string_utf16le_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                          const MVMuint32 *stopper_chars,
                                          MVMDecodeStreamSeparators *seps) {
    init_utf16_decoder_state(ds, UTF16_DECODE_LITTLE_ENDIAN);
    return MVM_string_utf16_decodestream_main(tc, ds, stopper_chars, seps,
        UTF16_DECODE_LITTLE_ENDIAN);
}

/* Decodestream entry point for 'utf16be'; forces big endian on every call. */
MVMuint32 MVM_string_utf16be_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                          const MVMuint32 *stopper_chars,
                                          MVMDecodeStreamSeparators *seps) {
    init_utf16_decoder_state(ds, UTF16_DECODE_BIG_ENDIAN);
    return MVM_string_utf16_decodestream_main(tc, ds, stopper_chars, seps,
        UTF16_DECODE_BIG_ENDIAN);
}

/* mostly from YAML-LibYAML */
/* Decodes using a decodestream. Decodes as far as it can with the input
 * buffers, or until a stopper is reached.
 *
 * `endianess` is what the caller asked for (BIG, LITTLE or AUTO); the byte
 * order actually used comes from the stream's decoder state, which AUTO may
 * update when a BOM is seen at the very start of the stream.
 *
 * Returns 0 if the stream has no buffers, 1 if zero chars were requested or
 * a separator / the requested char count was reached, and 0 otherwise.
 * Throws on malformed surrogate sequences or an unknown decoder state.
 *
 * NOTE(review): a surrogate pair split across two buffers is reported as an
 * incomplete pair, and a trailing odd byte in a buffer is not carried over
 * into the next buffer. Both are pre-existing limitations left untouched. */
MVMuint32 MVM_string_utf16_decodestream_main(MVMThreadContext *tc, MVMDecodeStream *ds,
                                             const MVMuint32 *stopper_chars,
                                             MVMDecodeStreamSeparators *seps,
                                             int endianess) {
    MVMuint32 count = 0, total = 0;
    MVMuint32 bufsize;
    MVMGrapheme32 *buffer;
    MVMDecodeStreamBytes *cur_bytes;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos;
    MVMuint32 reached_stopper = 0;
    /* Offsets of the low and high byte within each 16-bit code unit. */
    int low = 0, high = 1;
    /* Set to 1 to remove the BOM even when big endian or little endian are
     * explicitly specified. */
    int remove_bom = 0;

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return 0;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    /* Work out the byte order before allocating anything, so the error
     * path has nothing to clean up. A missing state is treated the same
     * as an unknown one rather than being dereferenced. */
    switch (ds->decoder_state ? utf16_decoder_state(ds) : 0) {
        case UTF16_DECODE_LITTLE_ENDIAN:
            low  = 0;
            high = 1;
            break;
        case UTF16_DECODE_BIG_ENDIAN:
            low  = 1;
            high = 0;
            break;
        default:
            MVM_exception_throw_adhoc(tc,
                "Unknown config setting in utf16 decodestream. This should never happen.");
    }

    bufsize = ds->result_size_guess;
    buffer  = MVM_malloc(bufsize * sizeof(MVMGrapheme32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    while (cur_bytes) {
        /* Process this buffer. */
        MVMint32  pos   = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        MVMuint8 *bytes = cur_bytes->bytes;

        /* A BOM is only meaningful at the very start of the stream, which
         * can only be in the head buffer. Checking later buffers as well
         * would strip a leading U+FEFF from them and move last_accept_pos
         * without moving last_accept_bytes.
         * NOTE(review): abs_byte_pos == 0 is taken to mean that nothing has
         * been consumed from the stream yet - confirm against MVMDecodeStream. */
        if (cur_bytes == ds->bytes_head && ds->abs_byte_pos == 0
                && pos + 1 < cur_bytes->length) {
            int skip_bom = 0;
            if (has_little_endian_bom(bytes + pos)) {
                /* Only change the setting if we are using standard 'utf16' decode
                 * which is meant to detect the encoding. */
                if (endianess == UTF16_DECODE_AUTO_ENDIAN) {
                    low  = 0;
                    high = 1;
                    utf16_decoder_state(ds) = UTF16_DECODE_LITTLE_ENDIAN;
                    skip_bom = 1;
                }
                /* If we see little endian BOM and we're set to utf16le, skip
                 * the BOM. */
                else if (endianess == UTF16_DECODE_LITTLE_ENDIAN && remove_bom) {
                    skip_bom = 1;
                }
            }
            else if (has_big_endian_bom(bytes + pos)) {
                if (endianess == UTF16_DECODE_AUTO_ENDIAN) {
                    low  = 1;
                    high = 0;
                    utf16_decoder_state(ds) = UTF16_DECODE_BIG_ENDIAN;
                    skip_bom = 1;
                }
                /* If we see a big endian BOM and we're set to utf16be, skip
                 * the BOM. */
                else if (endianess == UTF16_DECODE_BIG_ENDIAN && remove_bom) {
                    skip_bom = 1;
                }
            }
            if (skip_bom) {
                pos += 2;
                last_accept_bytes = cur_bytes;
                last_accept_pos   = pos;
            }
        }

        while (pos + 1 < cur_bytes->length) {
            MVMuint32 value = (bytes[pos+high] << 8) + bytes[pos+low];
            MVMuint32 value2;

            if ((value & 0xFC00) == 0xDC00) {
                MVM_free(buffer);
                MVM_exception_throw_adhoc(tc, "Malformed UTF-16; unexpected low surrogate");
            }

            if ((value & 0xFC00) == 0xD800) { /* high surrogate */
                pos += 2;
                if (pos + 1 >= cur_bytes->length) {
                    MVM_free(buffer);
                    MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
                }
                value2 = (bytes[pos+high] << 8) + bytes[pos+low];
                if ((value2 & 0xFC00) != 0xDC00) {
                    MVM_free(buffer);
                    MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
                }
                value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
            }

            if (count == bufsize) {
                /* We filled the buffer. Attach this one to the buffers
                 * linked list, and continue with a new one. */
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
                count  = 0;
            }
            buffer[count++]   = value;
            pos              += 2;
            last_accept_bytes = cur_bytes;
            last_accept_pos   = pos;
            total++;

            if (MVM_string_decode_stream_maybe_sep(tc, seps, value) ||
                    (stopper_chars && *stopper_chars == total)) {
                reached_stopper = 1;
                goto done;
            }
        }

        cur_bytes = cur_bytes->next;
    }

  done:
    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. */
    if (count) {
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
    else {
        MVM_free(buffer);
    }
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);

    return reached_stopper;
}

static MVMString * MVM_string_utf16_decode_main(MVMThreadContext *tc,
        const MVMObject *result_type, MVMuint8 *utf16_chars, size_t bytes, int endianess);

/* Decodes `bytes` bytes of big endian UTF-16. A leading BOM is not
 * treated specially here. */
MVMString * MVM_string_utf16be_decode(MVMThreadContext *tc,
        const MVMObject *result_type, char *utf16_chars, size_t bytes) {
    return MVM_string_utf16_decode_main(tc, result_type, (MVMuint8*)utf16_chars,
        bytes, UTF16_DECODE_BIG_ENDIAN);
}

/* Decodes `bytes` bytes of little endian UTF-16. A leading BOM is not
 * treated specially here. */
MVMString * MVM_string_utf16le_decode(MVMThreadContext *tc,
        const MVMObject *result_type, char *utf16_chars, size_t bytes) {
    return MVM_string_utf16_decode_main(tc, result_type, (MVMuint8*)utf16_chars,
        bytes, UTF16_DECODE_LITTLE_ENDIAN);
}

/* Decodes `bytes` bytes of UTF-16, using a leading BOM (which is skipped)
 * to pick the byte order and falling back to the host byte order when
 * there is none. */
MVMString * MVM_string_utf16_decode(MVMThreadContext *tc,
        const MVMObject *result_type, char *utf16_chars_in, size_t bytes) {
    MVMuint8 *utf16_chars = (MVMuint8*)utf16_chars_in;
#ifdef MVM_BIGENDIAN
    int mode = UTF16_DECODE_BIG_ENDIAN;
#else
    int mode = UTF16_DECODE_LITTLE_ENDIAN;
#endif
    /* set the byte order if there's a BOM */
    if (2 <= bytes) {
        if (has_little_endian_bom(utf16_chars)) {
            mode = UTF16_DECODE_LITTLE_ENDIAN;
            utf16_chars += 2;
            bytes -= 2;
        }
        else if (has_big_endian_bom(utf16_chars)) {
            mode = UTF16_DECODE_BIG_ENDIAN;
            utf16_chars += 2;
            bytes -= 2;
        }
    }
    return MVM_string_utf16_decode_main(tc, result_type, utf16_chars, bytes, mode);
}

/* Decodes the specified number of bytes of utf16 into an NFG string, creating
 * a result of the specified type. The type must have the MVMString REPR.
 *
 * `endianess` must be UTF16_DECODE_BIG_ENDIAN or UTF16_DECODE_LITTLE_ENDIAN.
 * Throws on an unknown mode, an odd byte count, a lone low surrogate, or a
 * high surrogate that is not followed by a low surrogate. */
static MVMString * MVM_string_utf16_decode_main(MVMThreadContext *tc,
        const MVMObject *result_type, MVMuint8 *utf16_chars, size_t bytes, int endianess) {
    MVMString *result;
    MVMGrapheme32 *buffer;
    size_t str_pos = 0;
    MVMuint8 *utf16 = (MVMuint8 *)utf16_chars;
    MVMuint8 *utf16_end = NULL;
    /* Offsets of the low and high byte within each 16-bit code unit. */
    int low = 0, high = 1;
    MVMNormalizer norm;
    MVMint32 ready;

    switch (endianess) {
        case UTF16_DECODE_BIG_ENDIAN:
            low  = 1;
            high = 0;
            break;
        case UTF16_DECODE_LITTLE_ENDIAN:
            low  = 0;
            high = 1;
            break;
        default:
            MVM_exception_throw_adhoc(tc, "Unknown mode set in utf16 decode. This should never happen.");
    }

    if (bytes % 2) {
        MVM_exception_throw_adhoc(tc, "Malformed UTF-16; odd number of bytes (%"PRIu64")", (MVMuint64)bytes);
    }

    utf16_end = utf16 + bytes;

    /* possibly allocating extra space; oh well. One slot per 16-bit code
     * unit; dividing first keeps the intermediate product from overflowing. */
    buffer = MVM_malloc(sizeof(MVMGrapheme32) * (bytes / 2));

    /* Need to normalize to NFG as we decode. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);

    for (; utf16 < utf16_end; utf16 += 2) {
        MVMuint32 value = (utf16[high] << 8) + utf16[low];
        MVMuint32 value2;
        MVMGrapheme32 g;

        if ((value & 0xFC00) == 0xDC00) {
            MVM_free(buffer);
            MVM_unicode_normalizer_cleanup(tc, &norm);
            MVM_exception_throw_adhoc(tc, "Malformed UTF-16; unexpected low surrogate");
        }

        if ((value & 0xFC00) == 0xD800) { /* high surrogate */
            utf16 += 2;
            /* bytes is even, so running out of input lands exactly on the end. */
            if (utf16 == utf16_end) {
                MVM_free(buffer);
                MVM_unicode_normalizer_cleanup(tc, &norm);
                MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
            }
            value2 = (utf16[high] << 8) + utf16[low];
            if ((value2 & 0xFC00) != 0xDC00) {
                MVM_free(buffer);
                MVM_unicode_normalizer_cleanup(tc, &norm);
                MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
            }
            value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
        }

        /* TODO: check for invalid values */
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, value, &g);
        if (ready) {
            /* The first ready grapheme comes back in g; fetch the others. */
            buffer[str_pos++] = g;
            while (--ready > 0)
                buffer[str_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        }
    }

    /* Get any final graphemes from the normalizer, and clean it up. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    while (ready--)
        buffer[str_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
    result->body.storage.blob_32 = buffer;
    result->body.storage_type    = MVM_STRING_GRAPHEME_32;
    result->body.num_graphs      = str_pos;

    return result;
}

/* Returns uint16 with its two bytes exchanged when enable_byte_swap is
 * non-zero, and unchanged otherwise. */
MVM_STATIC_INLINE MVMuint16 swap_bytes(MVMuint16 uint16, int enable_byte_swap) {
    return enable_byte_swap
        ? (MVMuint16)((uint16 << 8) | (uint16 >> 8))
        : uint16;
}

char * MVM_string_utf16_encode_substr_main(MVMThreadContext *tc, MVMString *str,
        MVMuint64 *output_size, MVMint64 start, MVMint64 length,
        MVMString *replacement, MVMint32 translate_newlines, int endianess);

/* Encodes the specified substring to utf16. The result string is NULL terminated, but
 * the specified size is the non-null part. (This being UTF-16, there are 2 null bytes
 * on the end.) This variant always produces big endian output. */
char * MVM_string_utf16be_encode_substr(MVMThreadContext *tc, MVMString *str,
        MVMuint64 *output_size, MVMint64 start, MVMint64 length,
        MVMString *replacement, MVMint32 translate_newlines) {
    return MVM_string_utf16_encode_substr_main(tc, str, output_size, start, length,
        replacement, translate_newlines, UTF16_DECODE_BIG_ENDIAN);
}

/* As MVM_string_utf16be_encode_substr, but always little endian. */
char * MVM_string_utf16le_encode_substr(MVMThreadContext *tc, MVMString *str,
        MVMuint64 *output_size, MVMint64 start, MVMint64 length,
        MVMString *replacement, MVMint32 translate_newlines) {
    return MVM_string_utf16_encode_substr_main(tc, str, output_size, start, length,
        replacement, translate_newlines, UTF16_DECODE_LITTLE_ENDIAN);
}

/* As MVM_string_utf16be_encode_substr, but in host byte order (no byte
 * swapping is done for UTF16_DECODE_AUTO_ENDIAN). */
char * MVM_string_utf16_encode_substr(MVMThreadContext *tc, MVMString *str,
        MVMuint64 *output_size, MVMint64 start, MVMint64 length,
        MVMString *replacement, MVMint32 translate_newlines) {
    return MVM_string_utf16_encode_substr_main(tc, str, output_size, start, length,
        replacement, translate_newlines, UTF16_DECODE_AUTO_ENDIAN);
}

/* Shared UTF-16 encoder.
 *
 * str                 string to encode
 * output_size         if non-NULL, receives the encoded size in bytes,
 *                     excluding the two terminating zero bytes
 * start, length       requested grapheme range; length == -1 means "to the
 *                     end of the string". Both are range checked.
 * replacement         if non-NULL, encoded in place of any codepoint that
 *                     cannot be represented; if NULL such a codepoint throws
 * translate_newlines  passed through to the codepoint iterator
 * endianess           BIG or LITTLE force that byte order; anything else
 *                     (AUTO) leaves code units in host byte order
 *
 * Returns a buffer obtained from MVM_malloc/MVM_realloc that ends in a
 * 16-bit zero.
 *
 * NOTE(review): start and length are only used for validation and for the
 * initial size estimate; the codepoint iterator is initialized on `str`
 * itself, so it looks like the whole string is encoded regardless of the
 * requested range - confirm before relying on substring behaviour. */
char * MVM_string_utf16_encode_substr_main(MVMThreadContext *tc, MVMString *str,
        MVMuint64 *output_size, MVMint64 start, MVMint64 length,
        MVMString *replacement, MVMint32 translate_newlines, int endianess) {
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - start : length);
    MVMuint16 *result;
    MVMuint16 *result_pos;
    MVMCodepointIter ci;
    MVMuint8 *repl_bytes = NULL;
    MVMuint64 repl_length = 0;
    /* Usable bytes in result, not counting the 2 reserved for the terminator. */
    MVMuint64 alloc_size;
    MVMuint64 scratch_space = 0;
    int enable_byte_swap = 0;
#ifdef MVM_BIGENDIAN
    if (endianess == UTF16_DECODE_LITTLE_ENDIAN)
        enable_byte_swap = 1;
#else
    if (endianess == UTF16_DECODE_BIG_ENDIAN)
        enable_byte_swap = 1;
#endif

    /* must check start first since it's used in the length check */
    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start (%"PRId64") out of range (0..%"PRIu32")", start, strgraphs);
    if (start + lengthu > strgraphs)
        MVM_exception_throw_adhoc(tc, "length (%"PRId64") out of range (0..%"PRIu32")", length, strgraphs);

    /* Encode the replacement with the same byte order as the main output;
     * going through the host-order entry point would splice code units of
     * the wrong endianness into a utf16be/utf16le result. */
    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_utf16_encode_substr_main(tc,
            replacement, &repl_length, 0, -1, NULL, translate_newlines, endianess);

    alloc_size = (MVMuint64)lengthu * 2;
    result     = MVM_malloc(alloc_size + 2);
    result_pos = result;

    MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
    while (MVM_string_ci_has_more(tc, &ci)) {
        MVMCodepoint value = MVM_string_ci_get_codepoint(tc, &ci);
        MVMuint64 used = 2 * (MVMuint64)(result_pos - result);
        MVMuint64 bytes_needed;

        if (value < 0x10000) {
            bytes_needed = 2;
        }
        else if (value <= 0x10FFFF) {
            bytes_needed = 4;
        }
        else {
            bytes_needed = repl_length;
        }

        if (alloc_size - used < bytes_needed) {
            /* Grow by doubling. Start from a non-zero size, since doubling
             * an empty allocation would never make progress. */
            MVMuint64 new_size = alloc_size ? alloc_size : 2;
            MVMuint16 *new_result;
            while (new_size - used < bytes_needed)
                new_size *= 2;
            new_result = MVM_realloc(result, new_size + 2);
            result_pos = new_result + (result_pos - result);
            result     = new_result;
            alloc_size = new_size;
        }

        if (value < 0x10000) {
            result_pos[0] = swap_bytes(value, enable_byte_swap);
            result_pos++;
        }
        else if (value <= 0x10FFFF) {
            /* Supplementary plane: split into a surrogate pair. Anything
             * above 0x10FFFF has no UTF-16 form and takes the replacement
             * or error path below. */
            value -= 0x10000;
            result_pos[0] = swap_bytes(0xD800 + (value >> 10), enable_byte_swap);
            result_pos[1] = swap_bytes(0xDC00 + (value & 0x3FF), enable_byte_swap);
            result_pos += 2;
        }
        else if (replacement) {
            memcpy(result_pos, repl_bytes, repl_length);
            result_pos += repl_length/2;
        }
        else {
            MVM_free(result);
            MVM_free(repl_bytes);
            MVM_exception_throw_adhoc(tc,
                "Error encoding UTF-16 string: could not encode codepoint %d",
                value);
        }
    }

    /* 16-bit terminator; space for it is always reserved past alloc_size. */
    result_pos[0] = 0;
    if (!output_size)
        output_size = &scratch_space;
    *output_size = (char *)result_pos - (char *)result;
    /* Shrink to exactly what was written plus the terminator. */
    result = MVM_realloc(result, *output_size + 2);
    MVM_free(repl_bytes);
    return (char *)result;
}

/* Encodes the whole string, double-NULL terminated. */
char * MVM_string_utf16_encode(MVMThreadContext *tc, MVMString *str, MVMint32 translate_newlines) {
    return MVM_string_utf16_encode_substr(tc, str, NULL, 0, -1, NULL, translate_newlines);
}