#include "moar.h"

/* A decode stream represents an on-going decoding process, from bytes into
 * characters. Bytes can be contributed to the decode stream, and chars can be
 * obtained. Byte buffers and decoded char buffers are kept in linked lists.
 * Note that characters may start at the end of one byte buffer and finish in
 * the next, which is taken care of by the logic in here and the decoders
 * themselves. Additionally, normalization may be applied using the normalizer
 * in the decode stream, at the discretion of the encoding in question (some,
 * such as ASCII and Latin-1, are normalized by definition). */

#define DECODE_NOT_EOF  0
#define DECODE_EOF      1

/* Allocates a zeroed decode stream and sets it up for the given encoding.
 * abs_byte_pos seeds the absolute byte position counter. When
 * translate_newlines is non-zero, the stream's normalizer is additionally
 * asked to translate newlines. The result size guess starts out at 64. */
MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding,
        MVMint64 abs_byte_pos, MVMint32 translate_newlines) {
    MVMDecodeStream *stream = MVM_calloc(1, sizeof(MVMDecodeStream));

    stream->encoding          = encoding;
    stream->abs_byte_pos      = abs_byte_pos;
    stream->result_size_guess = 64;

    MVM_unicode_normalizer_init(tc, &(stream->norm), MVM_NORMALIZE_NFG);
    if (translate_newlines)
        MVM_unicode_normalizer_translate_newlines(tc, &(stream->norm));

    return stream;
}

/* Queues a byte buffer at the tail of the stream's list of pending input.
 * The buffer pointer itself is stored (no copy is made). A buffer without
 * content (length <= 0) is not queued; it is freed on the spot instead. */
void MVM_string_decodestream_add_bytes(MVMThreadContext *tc, MVMDecodeStream *ds,
        MVMuint8 *bytes, MVMint32 length) {
    MVMDecodeStreamBytes *node;

    if (length <= 0) {
        MVM_free(bytes);
        return;
    }

    node         = MVM_calloc(1, sizeof(MVMDecodeStreamBytes));
    node->bytes  = bytes;
    node->length = length;

    if (!ds->bytes_head)
        ds->bytes_head = node;
    if (ds->bytes_tail)
        ds->bytes_tail->next = node;
    ds->bytes_tail = node;
}

/* Adds another char result buffer into the decoding stream.
*/ void MVM_string_decodestream_add_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 *chars, MVMint32 length) { MVMDecodeStreamChars *new_chars; if (ds->chars_reuse) { new_chars = ds->chars_reuse; ds->chars_reuse = NULL; } else { new_chars = MVM_malloc(sizeof(MVMDecodeStreamChars)); } new_chars->chars = chars; new_chars->length = length; new_chars->next = NULL; if (ds->chars_tail) ds->chars_tail->next = new_chars; ds->chars_tail = new_chars; if (!ds->chars_head) ds->chars_head = new_chars; } /* Internal function to free a chars result structure, putting it into the * re-use slot if it's empty. */ static void free_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamChars *chars) { if (ds->chars_reuse) MVM_free(chars); else ds->chars_reuse = chars; } /* Throws away byte buffers no longer needed. */ void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMDecodeStreamBytes *bytes, MVMint32 pos) { while (ds->bytes_head != bytes) { MVMDecodeStreamBytes *discard = ds->bytes_head; ds->abs_byte_pos += discard->length - ds->bytes_head_pos; ds->bytes_head = discard->next; ds->bytes_head_pos = 0; MVM_free(discard->bytes); MVM_free(discard); } if (!ds->bytes_head) { if (MVM_LIKELY(pos == 0)) return; /* Guard against null pointer dereference below. */ else MVM_exception_throw_adhoc(tc, "Unknown error encountered in MVM_string_decodestream_discard_to, pos = %"PRId32"", pos); } if (ds->bytes_head->length == pos) { /* We ate all of the new head buffer too; also free it. */ MVMDecodeStreamBytes *discard = ds->bytes_head; ds->abs_byte_pos += discard->length - ds->bytes_head_pos; ds->bytes_head = discard->next; ds->bytes_head_pos = 0; MVM_free(discard->bytes); MVM_free(discard); if (ds->bytes_head == NULL) ds->bytes_tail = NULL; } else { ds->abs_byte_pos += pos - ds->bytes_head_pos; ds->bytes_head_pos = pos; } } /* Does a decode run, selected by encoding. Returns non-zero if we actually * decoded more chars. 
*/ #define RUN_DECODE_NOTHING_DECODED 0 #define RUN_DECODE_STOPPER_NOT_REACHED 1 #define RUN_DECODE_STOPPER_REACHED 2 static MVMuint32 run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMuint32 *stopper_chars, MVMDecodeStreamSeparators *sep_spec, MVMint32 eof) { MVMDecodeStreamChars *prev_chars_tail = ds->chars_tail; MVMuint32 reached_stopper; switch (ds->encoding) { case MVM_encoding_type_utf8: reached_stopper = MVM_string_utf8_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_ascii: reached_stopper = MVM_string_ascii_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_latin1: reached_stopper = MVM_string_latin1_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_windows1252: reached_stopper = MVM_string_windows1252_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_windows1251: reached_stopper = MVM_string_windows1251_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_shiftjis: reached_stopper = MVM_string_shiftjis_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_utf8_c8: reached_stopper = MVM_string_utf8_c8_decodestream(tc, ds, stopper_chars, sep_spec, eof); break; case MVM_encoding_type_utf16: reached_stopper = MVM_string_utf16_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_utf16be: reached_stopper = MVM_string_utf16be_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_utf16le: reached_stopper = MVM_string_utf16le_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_gb2312: reached_stopper = MVM_string_gb2312_decodestream(tc, ds, stopper_chars, sep_spec); break; case MVM_encoding_type_gb18030: reached_stopper = MVM_string_gb18030_decodestream(tc, ds, stopper_chars, sep_spec); break; default: if (ds->encoding < MVM_encoding_type_MIN || MVM_encoding_type_MAX < ds->encoding) MVM_exception_throw_adhoc(tc, "invalid 
encoding type flag: %"PRIi32, ds->encoding); else MVM_exception_throw_adhoc(tc, "Streaming decode not yet implemented for %s encoding", MVM_string_encoding_cname(tc, ds->encoding)); } if (ds->chars_tail == prev_chars_tail) return RUN_DECODE_NOTHING_DECODED; else if (reached_stopper) return RUN_DECODE_STOPPER_REACHED; else return RUN_DECODE_STOPPER_NOT_REACHED; } /* In situations where we have hit EOF, we need to decode what's left and flush * the normalization buffer also. */ static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) { /* Decode all the things. */ if (ds->bytes_head) run_decode(tc, ds, NULL, NULL, DECODE_EOF); if (ds->bytes_head){ MVMDecodeStreamBytes *bh = ds->bytes_head; MVMint32 i = ds->bytes_head_pos; char dumped[16] = " xx xx xx xx..."; size_t j = 0; while(bh && j < sizeof(dumped) - 4){ if(i < bh->length){ int r = snprintf(&dumped[j], sizeof(dumped)-j, " %02hhx", bh->bytes[i++]); if(r < 0){ j = 0; break; } j += r; } if(i >= bh->length){ bh = bh->next; i = 0; } } if(j == 0){ MVM_exception_throw_adhoc(tc, "Incomplete character " "at the end of a stream"); }else{ if(bh) dumped[12] = '.'; MVM_exception_throw_adhoc(tc, "Incomplete character " "near bytes%s at the end of a stream", dumped); } } /* If there's some things left in the normalization buffer, take them. */ MVM_unicode_normalizer_eof(tc, &(ds->norm)); if (MVM_unicode_normalizer_available(tc, &(ds->norm))) { MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm)); MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32)); MVMint32 count = 0; while (ready--) buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm)); MVM_string_decodestream_add_chars(tc, ds, buffer, count); } } /* Gets the specified number of characters. If we are not yet able to decode * that many, returns NULL. This may mean more input buffers are needed. 
The * exclude parameter specifies a number of chars that should be taken from the * input buffer, but not included in the result string (for chomping a line * separator). */ static MVMuint32 missing_chars(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 wanted) { MVMint32 got = 0; MVMDecodeStreamChars *cur_chars = ds->chars_head; while (cur_chars && got < wanted) { if (cur_chars == ds->chars_head) got += cur_chars->length - ds->chars_head_pos; else got += cur_chars->length; cur_chars = cur_chars->next; } return got >= wanted ? 0 : wanted - got; } static MVMString * take_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars, MVMint32 exclude) { MVMString *result; MVMint32 found = 0; MVMint32 result_found = 0; MVMint32 result_chars = chars - exclude; if (result_chars < 0) MVM_exception_throw_adhoc(tc, "DecodeStream take_chars: chars - exclude < 0 should never happen, got (%"PRId32")", result_chars); result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); result->body.storage_type = MVM_STRING_GRAPHEME_32; result->body.num_graphs = result_chars; /* In the best case, the head char buffer has exactly what we need. This * will typically happen when it a steady state of decoding lines. */ if (ds->chars_head->length == chars && ds->chars_head_pos == 0) { MVMDecodeStreamChars *cur_chars = ds->chars_head; result->body.storage.blob_32 = cur_chars->chars; ds->chars_head = cur_chars->next; if (ds->chars_head == NULL) ds->chars_tail = NULL; free_chars(tc, ds, cur_chars); } /* Otherwise, need to take and copy. */ else { result->body.storage.blob_32 = MVM_malloc(result_chars * sizeof(MVMGrapheme32)); while (found < chars) { MVMDecodeStreamChars *cur_chars = ds->chars_head; MVMint32 available = cur_chars->length - ds->chars_head_pos; if (available <= chars - found) { /* We need all that's left in this buffer and likely * more. 
*/ MVMDecodeStreamChars *next_chars = cur_chars->next; if (available <= result_chars - result_found) { memcpy(result->body.storage.blob_32 + result_found, cur_chars->chars + ds->chars_head_pos, available * sizeof(MVMGrapheme32)); result_found += available; } else { MVMint32 to_copy = result_chars - result_found; memcpy(result->body.storage.blob_32 + result_found, cur_chars->chars + ds->chars_head_pos, to_copy * sizeof(MVMGrapheme32)); result_found += to_copy; } found += available; MVM_free(cur_chars->chars); free_chars(tc, ds, cur_chars); ds->chars_head = next_chars; ds->chars_head_pos = 0; if (ds->chars_head == NULL) ds->chars_tail = NULL; } else { /* There's enough in this buffer to satisfy us, and we'll leave * some behind. */ MVMint32 take = chars - found; MVMint32 to_copy = result_chars - result_found; memcpy(result->body.storage.blob_32 + result_found, cur_chars->chars + ds->chars_head_pos, to_copy * sizeof(MVMGrapheme32)); result_found += to_copy; found += take; ds->chars_head_pos += take; } } } return result; } MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars, MVMint64 eof) { MVMuint32 missing; /* If we request nothing, give empty string. */ if (chars == 0) return tc->instance->str_consts.empty; /* If we don't already have enough chars, try and decode more. */ missing = missing_chars(tc, ds, chars); ds->result_size_guess = missing; if (missing) run_decode(tc, ds, &missing, NULL, DECODE_NOT_EOF); /* If we've got enough, assemble a string. Otherwise, flag EOF and retry, * falling back to returning what's available. */ if (missing_chars(tc, ds, chars) == 0) { return take_chars(tc, ds, chars, 0); } else if (eof) { reached_eof(tc, ds); return missing_chars(tc, ds, chars) == 0 ? take_chars(tc, ds, chars, 0) : MVM_string_decodestream_get_all(tc, ds); } else { return NULL; } } /* Gets characters up until one of the specified separators is encountered. If * we do not encounter it, returns 0. 
This may mean more input buffers are needed * or that we reached the end of the stream. Note that it assumes the separator * will exist near the end of the buffer, if it occurs at all, due to decode * streams looking for stoppers. */ static MVMint32 have_separator(MVMThreadContext *tc, MVMDecodeStreamChars *start_chars, MVMint32 start_pos, MVMDecodeStreamSeparators *sep_spec, MVMint32 sep_idx, MVMint32 sep_graph_pos) { MVMint32 sep_pos = 1; MVMint32 sep_length = sep_spec->sep_lengths[sep_idx]; MVMDecodeStreamChars *cur_chars = start_chars; while (cur_chars) { MVMint32 start = cur_chars == start_chars ? start_pos : 0; MVMint32 i; for (i = start; i < cur_chars->length; i++) { if (cur_chars->chars[i] != sep_spec->sep_graphemes[sep_graph_pos]) return 0; sep_pos++; if (sep_pos == sep_length) return 1; sep_graph_pos++; } cur_chars = cur_chars->next; } return 0; } static MVMint32 find_separator(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMDecodeStreamSeparators *sep_spec, MVMint32 *sep_length, int eof) { MVMint32 sep_loc = 0; MVMDecodeStreamChars *cur_chars = ds->chars_head; /* First, skip over any buffers we need not consider. */ MVMint32 max_sep_length = sep_spec->max_sep_length; while (cur_chars && cur_chars->next) { if (cur_chars->next->length < max_sep_length) break; sep_loc += cur_chars->length; cur_chars = cur_chars->next; } /* Now scan for the separator. */ while (cur_chars) { MVMint32 i, j; MVMint32 start; if (eof) { start = cur_chars == ds->chars_head ? 
ds->chars_head_pos : 0; } else { start = cur_chars->length - max_sep_length; if (cur_chars == ds->chars_head) { if (start >= ds->chars_head_pos) sep_loc += start - ds->chars_head_pos; else start = ds->chars_head_pos; } else { if (start >= 0) sep_loc += start; else start = 0; } } for (i = start; i < cur_chars->length; i++) { MVMint32 sep_graph_pos = 0; MVMGrapheme32 cur_char = cur_chars->chars[i]; sep_loc++; for (j = 0; j < sep_spec->num_seps; j++) { if (sep_spec->sep_graphemes[sep_graph_pos] == cur_char) { if (sep_spec->sep_lengths[j] == 1) { *sep_length = 1; return sep_loc; } else if (have_separator(tc, cur_chars, i + 1, sep_spec, j, sep_graph_pos + 1)) { *sep_length = sep_spec->sep_lengths[j]; sep_loc += sep_spec->sep_lengths[j] - 1; return sep_loc; } } sep_graph_pos += sep_spec->sep_lengths[j]; } } cur_chars = cur_chars->next; } return 0; } MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) { MVMint32 sep_loc, sep_length; /* Look for separator, trying more decoding if it fails. We get the place * just beyond the separator, so can use take_chars to get what's need. * Note that decoders are only responsible for finding the final char of * the separator, so we may need to loop a few times around this. */ sep_loc = find_separator(tc, ds, sep_spec, &sep_length, 0); while (!sep_loc) { MVMuint32 decode_outcome = run_decode(tc, ds, NULL, sep_spec, DECODE_NOT_EOF); if (decode_outcome == RUN_DECODE_NOTHING_DECODED) break; if (decode_outcome == RUN_DECODE_STOPPER_REACHED) sep_loc = find_separator(tc, ds, sep_spec, &sep_length, 0); } if (sep_loc) { /* Use this line length as a guesstimate of the next, unless it's tiny * in which case we treat it as an outlier (probably an empty line or * some such). Also round up and to a nice power of 2. */ if (sep_loc > 32) ds->result_size_guess = (sep_loc << 1) & ~0xF; return take_chars(tc, ds, sep_loc, chomp ? 
sep_length : 0); } else { return NULL; } } /* Variant of MVM_string_decodestream_get_until_sep that is called when we * reach EOF. Trims the final separator if there is one, or returns the last * line without the EOF marker. */ MVMString * MVM_string_decodestream_get_until_sep_eof(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) { MVMint32 sep_loc, sep_length; /* Decode anything remaining and flush normalization buffer. */ reached_eof(tc, ds); /* Look for separator, which should by now be at the end, and chomp it * off if needed. */ sep_loc = find_separator(tc, ds, sep_spec, &sep_length, 1); if (sep_loc) return take_chars(tc, ds, sep_loc, chomp ? sep_length : 0); /* Otherwise, take all remaining chars. */ return MVM_string_decodestream_get_all(tc, ds); } /* Produces a string consisting of the characters available now in all decdoed * buffers. */ static MVMString * get_all_in_buffer(MVMThreadContext *tc, MVMDecodeStream *ds) { MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); result->body.storage_type = MVM_STRING_GRAPHEME_32; /* If there's no codepoint buffer, then return the empty string. */ if (!ds->chars_head) { result->body.storage.blob_32 = NULL; result->body.num_graphs = 0; } /* If there's exactly one resulting codepoint buffer and we swallowed none * of it, just use it. */ else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) { /* Set up result string. */ result->body.storage.blob_32 = ds->chars_head->chars; result->body.num_graphs = ds->chars_head->length; /* Don't free the buffer's memory itself, just the holder, as we * stole that for the buffer into the string above. */ free_chars(tc, ds, ds->chars_head); ds->chars_head = ds->chars_tail = NULL; } /* Otherwise, need to assemble all the things. */ else { /* Calculate length. 
*/ MVMint32 length = 0, pos = 0; MVMDecodeStreamChars *cur_chars = ds->chars_head; while (cur_chars) { if (cur_chars == ds->chars_head) length += cur_chars->length - ds->chars_head_pos; else length += cur_chars->length; cur_chars = cur_chars->next; } /* Allocate a result buffer of the right size. */ result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32)); result->body.num_graphs = length; /* Copy all the things into the target, freeing as we go. */ cur_chars = ds->chars_head; while (cur_chars) { MVMDecodeStreamChars *next_chars = cur_chars->next; if (cur_chars == ds->chars_head) { MVMint32 to_copy = ds->chars_head->length - ds->chars_head_pos; memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + ds->chars_head_pos, to_copy * sizeof(MVMGrapheme32)); pos += to_copy; } else { memcpy(result->body.storage.blob_32 + pos, cur_chars->chars, cur_chars->length * sizeof(MVMGrapheme32)); pos += cur_chars->length; } MVM_free(cur_chars->chars); free_chars(tc, ds, cur_chars); cur_chars = next_chars; } ds->chars_head = ds->chars_tail = NULL; } return result; } /* Decodes all the buffers, signals EOF to flush any normalization buffers, and * returns a string of all decoded chars. */ MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) { reached_eof(tc, ds); return get_all_in_buffer(tc, ds); } /* Decodes all the buffers we have, and returns a string of all decoded chars. * There may still be more to read after this, due to incomplete multi-byte * or multi-codepoint sequences that are not yet completely processed. */ MVMString * MVM_string_decodestream_get_available(MVMThreadContext *tc, MVMDecodeStream *ds) { if (ds->bytes_head) { ds->result_size_guess = ds->bytes_head->length; run_decode(tc, ds, NULL, NULL, DECODE_NOT_EOF); } return get_all_in_buffer(tc, ds); } /* Checks if we have the number of bytes requested. 
*/ MVMint64 MVM_string_decodestream_have_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 bytes) { MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; MVMint32 found = 0; while (cur_bytes) { found += cur_bytes == ds->bytes_head ? cur_bytes->length - ds->bytes_head_pos : cur_bytes->length; if (found >= bytes) return 1; cur_bytes = cur_bytes->next; } return 0; } /* Gets the number of bytes available. */ MVMint64 MVM_string_decodestream_bytes_available(MVMThreadContext *tc, const MVMDecodeStream *ds) { MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; MVMint32 available = 0; while (cur_bytes) { available += cur_bytes == ds->bytes_head ? cur_bytes->length - ds->bytes_head_pos : cur_bytes->length; cur_bytes = cur_bytes->next; } return available; } /* Copies up to the requested number of bytes into the supplied buffer, and * returns the number of bytes we actually copied. Takes from from the start * of the stream. */ MVMint64 MVM_string_decodestream_bytes_to_buf(MVMThreadContext *tc, MVMDecodeStream *ds, MVMuint8 **buf, MVMint32 bytes) { MVMint32 taken = 0; *buf = NULL; while (taken < bytes && ds->bytes_head) { /* Take what we can. */ MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; MVMint32 required = bytes - taken; MVMint32 available = cur_bytes->length - ds->bytes_head_pos; if (available <= required) { /* Take everything in this buffer and remove it. */ if (!*buf) *buf = MVM_malloc(cur_bytes->next ? bytes : available); memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, available); taken += available; ds->bytes_head = cur_bytes->next; ds->bytes_head_pos = 0; MVM_free(cur_bytes->bytes); MVM_free(cur_bytes); } else { /* Just take what we need. 
*/ if (!*buf) *buf = MVM_malloc(required); memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, required); taken += required; ds->bytes_head_pos += required; } } if (ds->bytes_head == NULL) ds->bytes_tail = NULL; ds->abs_byte_pos += taken; return taken; } /* Gets the absolute byte offset (the amount we started with plus what we've * chewed and handed back in decoded characters). */ MVMint64 MVM_string_decodestream_tell_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds) { return ds->abs_byte_pos; } /* Checks if the decode stream is empty. */ MVMint32 MVM_string_decodestream_is_empty(MVMThreadContext *tc, MVMDecodeStream *ds) { return !ds->bytes_head && !ds->chars_head && MVM_unicode_normalizer_empty(tc, &(ds->norm)); } /* Destroys a decoding stream, freeing all associated memory (including the * buffers). */ void MVM_string_decodestream_destroy(MVMThreadContext *tc, MVMDecodeStream *ds) { MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; MVMDecodeStreamChars *cur_chars = ds->chars_head; while (cur_bytes) { MVMDecodeStreamBytes *next_bytes = cur_bytes->next; MVM_free(cur_bytes->bytes); MVM_free(cur_bytes); cur_bytes = next_bytes; } while (cur_chars) { MVMDecodeStreamChars *next_chars = cur_chars->next; MVM_free(cur_chars->chars); MVM_free(cur_chars); cur_chars = next_chars; } MVM_unicode_normalizer_cleanup(tc, &(ds->norm)); MVM_free(ds->decoder_state); MVM_free(ds->chars_reuse); MVM_free(ds); } /* Calculates and caches various bits of information about separators, for * faster line reading. 
*/
/* Derived from sep_lengths and sep_graphemes (which must be set up already):
 * the length of the longest separator (at least 1), the final grapheme of
 * every separator, and the largest of those final graphemes (-1 if none
 * exceeds that). A new final_graphemes array is allocated here; whatever the
 * field held before is overwritten, not freed. */
static void cache_sep_info(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
    MVMGrapheme32 *finals = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32));
    MVMint32 highest_final = -1;
    MVMint32 longest       = 1;
    MVMint32 end_pos       = 0;  /* one past the current separator in sep_graphemes */
    MVMint32 idx;

    for (idx = 0; idx < sep_spec->num_seps; idx++) {
        MVMint32 len = sep_spec->sep_lengths[idx];

        end_pos += len;
        finals[idx] = sep_spec->sep_graphemes[end_pos - 1];

        if (len > longest)
            longest = len;
        if (finals[idx] > highest_final)
            highest_final = finals[idx];
    }

    sep_spec->max_sep_length     = longest;
    sep_spec->final_graphemes    = finals;
    sep_spec->max_final_grapheme = highest_final;
}

/* Sets a decode stream separator to its default value: two separators of one
 * grapheme each, '\n' and the grapheme obtained from MVM_nfg_crlf_grapheme.
 * The arrays are newly allocated; previous contents of sep_spec are not
 * freed. */
void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
    MVMint32      *lengths;
    MVMGrapheme32 *graphemes;

    sep_spec->num_seps = 2;
    lengths   = MVM_malloc(sep_spec->num_seps * sizeof(MVMint32));
    graphemes = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32));
    sep_spec->sep_lengths   = lengths;
    sep_spec->sep_graphemes = graphemes;

    lengths[0]   = 1;
    graphemes[0] = '\n';

    lengths[1]   = 1;
    graphemes[1] = MVM_nfg_crlf_grapheme(tc);

    cache_sep_info(tc, sep_spec);
}

/* Takes a string and sets it up as a decode stream separator.
*/
/* Replaces the separators in sep_spec with the num_seps strings in seps.
 * Each string's graphemes are stored back to back in sep_graphemes, with the
 * per-separator lengths in sep_lengths, and the cached lookup information is
 * rebuilt. Throws if there are more than 4095 separators or if one of them
 * is longer than 65535 graphemes.
 *
 * The replacement arrays are built completely before the old ones are
 * released, so sep_spec is not modified when one of those limit checks
 * throws, and never ends up pointing at freed memory. */
void MVM_string_decode_stream_sep_from_strings(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec,
                                               MVMString **seps, MVMint32 num_seps) {
    MVMGraphemeIter gi;
    MVMint32 i, graph_length, graph_pos, *sep_lengths;
    MVMGrapheme32 *sep_graphemes;

    if (num_seps > 0xFFF)
        MVM_exception_throw_adhoc(tc, "Too many line separators (%"PRId32"), max allowed is 4095", num_seps);

    /* Validate and measure every separator first. */
    sep_lengths = MVM_malloc(num_seps * sizeof(MVMint32));
    graph_length = 0;
    for (i = 0; i < num_seps; i++) {
        MVMuint32 num_graphs = MVM_string_graphs(tc, seps[i]);
        if (num_graphs > 0xFFFF) {
            MVM_free(sep_lengths);
            MVM_exception_throw_adhoc(tc, "Line separator (%"PRIu32") too long, max allowed is 65535", num_graphs);
        }
        sep_lengths[i] = num_graphs;
        graph_length += num_graphs;
    }

    /* Flatten the graphemes of all separators into one array. */
    sep_graphemes = MVM_malloc(graph_length * sizeof(MVMGrapheme32));
    graph_pos = 0;
    for (i = 0; i < num_seps; i++) {
        MVM_string_gi_init(tc, &gi, seps[i]);
        while (MVM_string_gi_has_more(tc, &gi))
            sep_graphemes[graph_pos++] = MVM_string_gi_get_grapheme(tc, &gi);
    }

    /* All checks have passed: release the previous separator data and put
     * the new arrays in place. */
    MVM_free(sep_spec->sep_lengths);
    MVM_free(sep_spec->sep_graphemes);
    MVM_free(sep_spec->final_graphemes);
    sep_spec->num_seps        = num_seps;
    sep_spec->sep_lengths     = sep_lengths;
    sep_spec->sep_graphemes   = sep_graphemes;
    sep_spec->final_graphemes = NULL; /* reassigned by cache_sep_info below */

    cache_sep_info(tc, sep_spec);
}

/* Cleans up memory associated with a stream separator set. The pointers are
 * cleared afterwards, so cleaning up the same separator set twice does not
 * free anything twice. */
void MVM_string_decode_stream_sep_destroy(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
    MVM_free(sep_spec->sep_lengths);
    MVM_free(sep_spec->sep_graphemes);
    MVM_free(sep_spec->final_graphemes);
    sep_spec->sep_lengths     = NULL;
    sep_spec->sep_graphemes   = NULL;
    sep_spec->final_graphemes = NULL;
}