diff --git a/include/parrot/encoding.h b/include/parrot/encoding.h index 2771a7e..23023bb 100644 --- a/include/parrot/encoding.h +++ b/include/parrot/encoding.h @@ -37,6 +37,16 @@ struct string_iterator_t; /* s. parrot/string.h */ typedef void (*encoding_iter_init_t)(PARROT_INTERP, const STRING *src, struct string_iterator_t *); +typedef UINTVAL (*encoding_iter_get_t)( + PARROT_INTERP, const STRING *str, const String_iter *i, INTVAL offset); +typedef void (*encoding_iter_skip_t)( + PARROT_INTERP, const STRING *str, String_iter *i, INTVAL skip); +typedef UINTVAL (*encoding_iter_get_and_advance_t)( + PARROT_INTERP, const STRING *str, String_iter *i); +typedef void (*encoding_iter_set_and_advance_t)( + PARROT_INTERP, STRING *str, String_iter *i, UINTVAL c); +typedef void (*encoding_iter_set_position_t)( + PARROT_INTERP, const STRING *str, String_iter *i, UINTVAL pos); struct _encoding { ARGIN(const char *name); @@ -57,6 +67,11 @@ struct _encoding { encoding_bytes_t bytes; encoding_iter_init_t iter_init; encoding_find_cclass_t find_cclass; + encoding_iter_get_t iter_get; + encoding_iter_skip_t iter_skip; + encoding_iter_get_and_advance_t iter_get_and_advance; + encoding_iter_set_and_advance_t iter_set_and_advance; + encoding_iter_set_position_t iter_set_position; }; typedef struct _encoding ENCODING; diff --git a/include/parrot/string.h b/include/parrot/string.h index fb6a3be..7d87f8e 100644 --- a/include/parrot/string.h +++ b/include/parrot/string.h @@ -37,6 +37,19 @@ typedef struct string_iterator_t { void (*set_position)(PARROT_INTERP, struct string_iterator_t *i, UINTVAL pos); } String_iter; +#define STRING_ITER_INIT(i, iter) \ + (iter)->charpos = (iter)->bytepos = 0 +#define STRING_ITER_GET(i, str, iter, offset) \ + ((str)->encoding)->iter_get((i), (str), (iter), (offset)) +#define STRING_ITER_SKIP(i, str, iter, skip) \ + ((str)->encoding)->iter_skip((i), (str), (iter), (skip)) +#define STRING_ITER_GET_AND_ADVANCE(i, str, iter) \ + 
((str)->encoding)->iter_get_and_advance((i), (str), (iter)) +#define STRING_ITER_SET_AND_ADVANCE(i, str, iter, c) \ + ((str)->encoding)->iter_set_and_advance((i), (str), (iter), (c)) +#define STRING_ITER_SET_POSITION(i, str, iter, pos) \ + ((str)->encoding)->iter_set_position((i), (str), (iter), (pos)) + #define STREQ(x, y) (strcmp((x), (y))==0) #define STRNEQ(x, y) (strcmp((x), (y))!=0) diff --git a/include/parrot/string_funcs.h b/include/parrot/string_funcs.h index f54af57..d445368 100644 --- a/include/parrot/string_funcs.h +++ b/include/parrot/string_funcs.h @@ -253,6 +253,32 @@ PARROT_EXPORT INTVAL Parrot_str_is_null(SHIM_INTERP, ARGIN_NULLOK(const STRING *s)); PARROT_EXPORT +INTVAL Parrot_str_iter_index(PARROT_INTERP, + ARGIN(const STRING *src), + ARGMOD(String_iter *start), + ARGMOD(String_iter *end), + ARGIN(const STRING *search)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + __attribute__nonnull__(4) + __attribute__nonnull__(5) + FUNC_MODIFIES(*start) + FUNC_MODIFIES(*end); + +PARROT_EXPORT +PARROT_CANNOT_RETURN_NULL +PARROT_WARN_UNUSED_RESULT +STRING * Parrot_str_iter_substr(PARROT_INTERP, + ARGMOD(STRING *str), + ARGIN(const String_iter *l), + ARGIN_NULLOK(const String_iter *r)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str); + +PARROT_EXPORT PARROT_WARN_UNUSED_RESULT PARROT_CANNOT_RETURN_NULL STRING* Parrot_str_join(PARROT_INTERP, @@ -631,6 +657,16 @@ STRING* Parrot_str_from_uint(PARROT_INTERP, PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(s)) #define ASSERT_ARGS_Parrot_str_is_null __attribute__unused__ int _ASSERT_ARGS_CHECK = (0) +#define ASSERT_ARGS_Parrot_str_iter_index __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(src) \ + , PARROT_ASSERT_ARG(start) \ + , PARROT_ASSERT_ARG(end) \ + , PARROT_ASSERT_ARG(search)) +#define ASSERT_ARGS_Parrot_str_iter_substr __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(l)) #define ASSERT_ARGS_Parrot_str_join __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(ar)) diff --git a/src/io/utf8.c b/src/io/utf8.c index 0df3d22..f2b3b5d 100644 --- a/src/io/utf8.c +++ b/src/io/utf8.c @@ -57,7 +57,7 @@ Parrot_io_read_utf8(PARROT_INTERP, ARGMOD(PMC *filehandle), s->encoding = Parrot_utf8_encoding_ptr; /* count chars, verify utf8 */ - Parrot_utf8_encoding_ptr->iter_init(interp, s, &iter); + STRING_ITER_INIT(interp, &iter); while (iter.bytepos < s->bufused) { if (iter.bytepos + 4 > s->bufused) { @@ -92,7 +92,7 @@ Parrot_io_read_utf8(PARROT_INTERP, ARGMOD(PMC *filehandle), } } ok: - iter.get_and_advance(interp, &iter); + Parrot_utf8_encoding_ptr->iter_get_and_advance(interp, *buf, &iter); } s->strlen = iter.charpos; return len; diff --git a/src/pmc/stringiterator.pmc b/src/pmc/stringiterator.pmc index af58972..986dc8c 100644 --- a/src/pmc/stringiterator.pmc +++ b/src/pmc/stringiterator.pmc @@ -23,11 +23,10 @@ Implementation of Iterator for String PMC. pmclass StringIterator auto_attrs extends Iterator { - ATTR PMC *string; /* String to iterate over */ - ATTR INTVAL pos; /* Current position of iterator for forward iterator */ - /* Previous position of iterator for reverse iterator */ - ATTR INTVAL length; /* Length of C */ - ATTR INTVAL reverse; /* Direction of iteration. 1 - for reverse iteration */ + ATTR PMC *string; /* String PMC to iterate over */ + ATTR STRING *str_val; /* The actual string */ + ATTR String_iter iter; /* String iterator */ + ATTR INTVAL reverse; /* Direction of iteration. 1 - for reverse iteration */ /* @@ -39,7 +38,13 @@ Initialize StringIterator. 
*/ VTABLE void init_pmc(PMC *string) { + Parrot_StringIterator_attributes * const attrs = + PARROT_STRINGITERATOR(SELF); + STRING * const str_val = VTABLE_get_string(INTERP, string); + SET_ATTR_string(INTERP, SELF, string); + SET_ATTR_str_val(INTERP, SELF, str_val); + STRING_ITER_INIT(INTERP, &attrs->iter); /* by default, iterate from start */ SELF.set_integer_native(ITERATE_FROM_START); @@ -58,8 +63,12 @@ Marks the current idx/key and the aggregate as live. VTABLE void mark() { PMC *string; + STRING *str_val; + GET_ATTR_string(INTERP, SELF, string); Parrot_gc_mark_PMC_alive(INTERP, string); + GET_ATTR_str_val(INTERP, SELF, str_val); + Parrot_gc_mark_STRING_alive(INTERP, str_val); } /* @@ -77,7 +86,8 @@ Marks the current idx/key and the aggregate as live. Parrot_StringIterator_attributes * const clone_attrs = PARROT_STRINGITERATOR(clone); - clone_attrs->pos = attrs->pos; + /* TODO: this isn't safe if the string PMC has changed */ + clone_attrs->iter = attrs->iter; clone_attrs->reverse = attrs->reverse; return clone; } @@ -110,9 +120,9 @@ Returns the number of remaining elements in the C. Parrot_StringIterator_attributes * const attrs = PARROT_STRINGITERATOR(SELF); if (attrs->reverse) - return attrs->pos; + return attrs->iter.charpos; else - return attrs->length - attrs->pos; + return attrs->str_val->strlen - attrs->iter.charpos; } VTABLE INTVAL get_integer() { @@ -137,13 +147,11 @@ Reset the Iterator. 
C must be one of PARROT_STRINGITERATOR(SELF); if (value == ITERATE_FROM_START) { attrs->reverse = 0; - attrs->pos = 0; - attrs->length = VTABLE_elements(INTERP, attrs->string); + STRING_ITER_SET_POSITION(INTERP, attrs->str_val, &attrs->iter, 0); } else if (value == ITERATE_FROM_END) { attrs->reverse = 1; - attrs->pos = attrs->length - = VTABLE_elements(INTERP, attrs->string); + STRING_ITER_SET_POSITION(INTERP, attrs->str_val, &attrs->iter, attrs->str_val->strlen); } else Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_INVALID_OPERATION, @@ -179,14 +187,17 @@ Shift next character from C as PMC. Parrot_StringIterator_attributes * const attrs = PARROT_STRINGITERATOR(SELF); PMC *ret; + STRING *str; + const String_iter old_iter = attrs->iter; - if (attrs->pos >= attrs->length) + if (attrs->iter.charpos >= attrs->str_val->strlen) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); ret = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, enum_class_String)); - VTABLE_set_string_native(INTERP, ret, - VTABLE_get_string_keyed_int(INTERP, attrs->string, attrs->pos++)); + STRING_ITER_SKIP(INTERP, attrs->str_val, &attrs->iter, 1); + str = Parrot_str_iter_substr(INTERP, attrs->str_val, &old_iter, &attrs->iter); + VTABLE_set_string_native(INTERP, ret, str); return ret; } @@ -202,12 +213,14 @@ Shift next character from C. VTABLE STRING *shift_string() { Parrot_StringIterator_attributes * const attrs = PARROT_STRINGITERATOR(SELF); + const String_iter old_iter = attrs->iter; - if (attrs->pos >= attrs->length) + if (attrs->iter.charpos >= attrs->str_val->strlen) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); - return VTABLE_get_string_keyed_int(INTERP, attrs->string, attrs->pos++); + STRING_ITER_SKIP(INTERP, attrs->str_val, &attrs->iter, 1); + return Parrot_str_iter_substr(INTERP, attrs->str_val, &old_iter, &attrs->iter); } /* @@ -223,11 +236,11 @@ Shift next character code from C. 
Parrot_StringIterator_attributes * const attrs = PARROT_STRINGITERATOR(SELF); - if (attrs->pos >= attrs->length) + if (attrs->iter.charpos >= attrs->str_val->strlen) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); - return VTABLE_get_integer_keyed_int(INTERP, attrs->string, attrs->pos++); + return STRING_ITER_GET_AND_ADVANCE(INTERP, attrs->str_val, &attrs->iter); } /* @@ -243,14 +256,17 @@ Shift "next" character from C for reverse iterator as PMC. Parrot_StringIterator_attributes * const attrs = PARROT_STRINGITERATOR(SELF); PMC *ret; + STRING * str; + const String_iter old_iter = attrs->iter; - if (!STATICSELF.get_bool()) + if (attrs->iter.charpos <= 0) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); ret = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, enum_class_String)); - VTABLE_set_string_native(INTERP, ret, - VTABLE_get_string_keyed_int(INTERP, attrs->string, --attrs->pos)); + STRING_ITER_SKIP(INTERP, attrs->str_val, &attrs->iter, -1); + str = Parrot_str_iter_substr(INTERP, attrs->str_val, &attrs->iter, &old_iter); + VTABLE_set_string_native(INTERP, ret, str); return ret; } @@ -266,12 +282,14 @@ Shift "next" character from C for reverse iterator. VTABLE STRING *pop_string() { Parrot_StringIterator_attributes * const attrs = PARROT_STRINGITERATOR(SELF); + const String_iter old_iter = attrs->iter; - if (!STATICSELF.get_bool()) + if (attrs->iter.charpos <= 0) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); - return VTABLE_get_string_keyed_int(INTERP, attrs->string, --attrs->pos); + STRING_ITER_SKIP(INTERP, attrs->str_val, &attrs->iter, -1); + return Parrot_str_iter_substr(INTERP, attrs->str_val, &attrs->iter, &old_iter); } /* @@ -287,11 +305,12 @@ Shift "next" character code from C for reverse iterator. 
Parrot_StringIterator_attributes * const attrs = PARROT_STRINGITERATOR(SELF); - if (!STATICSELF.get_bool()) + if (attrs->iter.charpos <= 0) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); - return VTABLE_get_integer_keyed_int(INTERP, attrs->string, --attrs->pos); + STRING_ITER_SKIP(INTERP, attrs->str_val, &attrs->iter, -1); + return STRING_ITER_GET(INTERP, attrs->str_val, &attrs->iter, 0); } /* @@ -305,8 +324,15 @@ Get integer value of current position plus idx. */ VTABLE INTVAL get_integer_keyed_int(INTVAL idx) { - return VTABLE_get_integer_keyed_int(INTERP, STATICSELF.get_pmc(), - PARROT_STRINGITERATOR(SELF)->pos + idx); + Parrot_StringIterator_attributes * const attrs = + PARROT_STRINGITERATOR(SELF); + const UINTVAL offset = attrs->iter.charpos + idx; + + if (offset >= attrs->str_val->strlen) + Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, + "StopIteration"); + + return STRING_ITER_GET(INTERP, attrs->str_val, &attrs->iter, idx); } /* @@ -320,8 +346,22 @@ Get string value of current position plus idx. 
*/ VTABLE STRING *get_string_keyed_int(INTVAL idx) { - return VTABLE_get_string_keyed_int(INTERP, STATICSELF.get_pmc(), - PARROT_STRINGITERATOR(SELF)->pos + idx); + Parrot_StringIterator_attributes * const attrs = + PARROT_STRINGITERATOR(SELF); + const UINTVAL offset = attrs->iter.charpos + idx; + String_iter iter, next_iter; + + if (offset >= attrs->str_val->strlen) + Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, + "StopIteration"); + + iter = attrs->iter; + if (idx != 0) + STRING_ITER_SKIP(INTERP, attrs->str_val, &iter, idx); + next_iter = iter; + STRING_ITER_SKIP(INTERP, attrs->str_val, &next_iter, 1); + + return Parrot_str_iter_substr(INTERP, attrs->str_val, &iter, &next_iter); } } diff --git a/src/string/api.c b/src/string/api.c index 1de008e..a73393a 100644 --- a/src/string/api.c +++ b/src/string/api.c @@ -1289,6 +1289,119 @@ Parrot_str_substr(PARROT_INTERP, } } +/* + +=item C<STRING * Parrot_str_iter_substr(PARROT_INTERP, STRING *str, const +String_iter *l, const String_iter *r)> + +Returns the substring between iterators C<l> and C<r>. + +=cut + +*/ + +PARROT_EXPORT +PARROT_CANNOT_RETURN_NULL +PARROT_WARN_UNUSED_RESULT +STRING * +Parrot_str_iter_substr(PARROT_INTERP, + ARGMOD(STRING *str), + ARGIN(const String_iter *l), ARGIN_NULLOK(const String_iter *r)) +{ + ASSERT_ARGS(Parrot_str_iter_substr) + STRING *dest = Parrot_str_new_COW(interp, str); + + dest->strstart = (char *)dest->strstart + l->bytepos; + + if (r == NULL) { + dest->bufused = str->bufused - l->bytepos; + dest->strlen = str->strlen - l->charpos; + } + else { + dest->bufused = r->bytepos - l->bytepos; + dest->strlen = r->charpos - l->charpos; + } + + dest->hashval = 0; + + return dest; +} + +/* + +=item C<INTVAL Parrot_str_iter_index(PARROT_INTERP, const STRING *src, +String_iter *start, String_iter *end, const STRING *search)> + +Find the next occurrence of STRING C<search> in STRING C<src> starting at +String_iter C<start>. If C<search> is found C<start> is modified to mark the +beginning of C<search> and String_iter C<end> is set to the character after +C<search> in C<src>. Returns the character position where C<search> was found +or -1 if it wasn't found.
+ +=cut + +*/ + +PARROT_EXPORT +INTVAL +Parrot_str_iter_index(PARROT_INTERP, + ARGIN(const STRING *src), + ARGMOD(String_iter *start), ARGMOD(String_iter *end), + ARGIN(const STRING *search)) +{ + ASSERT_ARGS(Parrot_str_iter_index) + String_iter search_iter; + const UINTVAL len = search->strlen; + + *end = *start; + + if (len == 0) { + return start->charpos; + } + + STRING_ITER_INIT(interp, &search_iter); + + if (len == 1) { + const UINTVAL c0 = STRING_ITER_GET(interp, search, &search_iter, 0); + + while (start->charpos < src->strlen) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, end); + if (c == c0) + return start->charpos; + *start = *end; + } + } + else { + const UINTVAL c0 = STRING_ITER_GET_AND_ADVANCE(interp, search, &search_iter); + String_iter search_start = search_iter; + + while (1) { + String_iter src_start_iter; + UINTVAL c1, c2; + + do { + *start = *end; + if (start->charpos + len > src->strlen) + return -1; + c1 = STRING_ITER_GET_AND_ADVANCE(interp, src, end); + } while (c1 != c0); + + do { + if (search_iter.charpos >= len) + return start->charpos; + c1 = STRING_ITER_GET_AND_ADVANCE(interp, src, end); + c2 = STRING_ITER_GET_AND_ADVANCE(interp, search, &search_iter); + } while (c1 == c2); + + STRING_ITER_SKIP(interp, src, start, 1); + *end = *start; + search_iter = search_start; + } + } + + return -1; +} + /* @@ -1383,12 +1496,12 @@ Parrot_str_replace(PARROT_INTERP, ARGIN(STRING *src), } /* get byte position of the part that will be replaced */ - ENCODING_ITER_INIT(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); - iter.set_position(interp, &iter, true_offset); + STRING_ITER_SET_POSITION(interp, src, &iter, true_offset); start_byte = iter.bytepos; - iter.set_position(interp, &iter, true_offset + true_length); + STRING_ITER_SET_POSITION(interp, src, &iter, true_offset + true_length); end_byte = iter.bytepos; /* not possible.... 
*/ @@ -1486,7 +1599,7 @@ void Parrot_str_chopn_inplace(PARROT_INTERP, ARGMOD(STRING *s), INTVAL n) { ASSERT_ARGS(Parrot_str_chopn_inplace) - UINTVAL new_length, uchar_size; + UINTVAL new_length; if (n < 0) { new_length = -n; @@ -1507,23 +1620,23 @@ Parrot_str_chopn_inplace(PARROT_INTERP, ARGMOD(STRING *s), INTVAL n) return; } - uchar_size = s->bufused / s->strlen; - s->strlen = new_length; - if (s->encoding == Parrot_fixed_8_encoding_ptr) { s->bufused = new_length; } else if (s->encoding == Parrot_ucs2_encoding_ptr) { + const UINTVAL uchar_size = s->bufused / s->strlen; s->bufused = new_length * uchar_size; } else { String_iter iter; - ENCODING_ITER_INIT(interp, s, &iter); - iter.set_position(interp, &iter, new_length); + STRING_ITER_INIT(interp, &iter); + STRING_ITER_SET_POSITION(interp, s, &iter, new_length); s->bufused = iter.bytepos; } + s->strlen = new_length; + return; } @@ -2159,13 +2272,12 @@ Parrot_str_to_int(PARROT_INTERP, ARGIN_NULLOK(const STRING *s)) int sign = 1; INTVAL i = 0; String_iter iter; - UINTVAL offs; number_parse_state state = parse_start; - ENCODING_ITER_INIT(interp, s, &iter); + STRING_ITER_INIT(interp, &iter); - for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + while (state != parse_end && iter.charpos < s->strlen) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, s, &iter); /* Check for overflow */ if (c > 255) break; @@ -2250,17 +2362,16 @@ Parrot_str_to_num(PARROT_INTERP, ARGIN(const STRING *s)) int d_length = 0; int check_nan = 0; /* Check for NaN and Inf after main loop */ String_iter iter; - UINTVAL offs; number_parse_state state = parse_start; if (!s) return 0.0; - ENCODING_ITER_INIT(interp, s, &iter); + STRING_ITER_INIT(interp, &iter); /* Handcrafter FSM to read float value */ - for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + while (state != parse_end && iter.charpos 
< s->strlen) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, s, &iter); /* Check for overflow */ if (c > 255) break; @@ -2635,7 +2746,6 @@ Parrot_str_to_hashval(PARROT_INTERP, ARGMOD_NULLOK(STRING *s)) { ASSERT_ARGS(Parrot_str_to_hashval) String_iter iter; - UINTVAL offs; size_t hashval = interp->hash_seed; if (!s) @@ -2644,10 +2754,10 @@ Parrot_str_to_hashval(PARROT_INTERP, ARGMOD_NULLOK(STRING *s)) /* ZZZZZ workaround for something not setting up encodings right */ saneify_string(s); - ENCODING_ITER_INIT(interp, s, &iter); + STRING_ITER_INIT(interp, &iter); - for (offs = 0; offs < s->strlen; ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + while (iter.charpos < s->strlen) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, s, &iter); hashval += hashval << 5; hashval += c; } @@ -2725,11 +2835,11 @@ Parrot_str_escape_truncate(PARROT_INTERP, Parrot_fixed_8_encoding_ptr, Parrot_ascii_charset_ptr, 0); /* more work TODO */ - ENCODING_ITER_INIT(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); dp = (unsigned char *)result->strstart; for (i = 0; len > 0; --len) { - UINTVAL c = iter.get_and_advance(interp, &iter); + UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (c < 0x7f) { /* process ASCII chars */ if (i >= charlen - 2) { @@ -2883,7 +2993,7 @@ Parrot_str_unescape(PARROT_INTERP, encoding = result->encoding; } - encoding->iter_init(interp, result, &iter); + STRING_ITER_INIT(interp, &iter); for (offs = d = 0; offs < clength; ++offs) { r = (Parrot_UInt4)((unsigned char *)result->strstart)[offs]; @@ -2906,7 +3016,7 @@ Parrot_str_unescape(PARROT_INTERP, } PARROT_ASSERT(d < offs); - iter.set_and_advance(interp, &iter, r); + encoding->iter_set_and_advance(interp, result, &iter, r); ++d; } @@ -3441,8 +3551,10 @@ Parrot_str_split(PARROT_INTERP, ARGIN_NULLOK(STRING *delim), ARGIN_NULLOK(STRING *str)) { ASSERT_ARGS(Parrot_str_split) - PMC *res; - INTVAL slen, dlen, ps, pe; + PMC *res; + STRING *tstr; + UINTVAL slen, dlen; + 
String_iter iter; if (STRING_IS_NULL(delim) || STRING_IS_NULL(str)) return PMCNULL; @@ -3453,44 +3565,38 @@ Parrot_str_split(PARROT_INTERP, if (!slen) return res; + STRING_ITER_INIT(interp, &iter); dlen = Parrot_str_byte_length(interp, delim); if (dlen == 0) { - int i; VTABLE_set_integer_native(interp, res, slen); - for (i = 0; i < slen; ++i) { - STRING * const p = Parrot_str_substr(interp, str, i, 1, NULL, 0); - VTABLE_set_string_keyed_int(interp, res, i, p); - } - - return res; - } + do { + const String_iter old_iter = iter; - pe = Parrot_str_find_index(interp, str, delim, 0); + STRING_ITER_SKIP(interp, str, &iter, 1); + tstr = Parrot_str_iter_substr(interp, str, &old_iter, &iter); + VTABLE_set_string_keyed_int(interp, res, old_iter.charpos, tstr); + } while (iter.charpos < slen); - if (pe < 0) { - VTABLE_push_string(interp, res, str); return res; } - ps = 0; - - while (ps <= slen) { - const int pl = pe - ps; - STRING * const tstr = Parrot_str_substr(interp, str, ps, pl, NULL, 0); - - VTABLE_push_string(interp, res, tstr); - ps = pe + Parrot_str_byte_length(interp, delim); + do { + String_iter start, end; + INTVAL pos; - if (ps > slen) + start = iter; + if (Parrot_str_iter_index(interp, str, &start, &end, delim) < 0) break; - pe = Parrot_str_find_index(interp, str, delim, ps); + tstr = Parrot_str_iter_substr(interp, str, &iter, &start); + VTABLE_push_string(interp, res, tstr); + iter = end; + } while (iter.charpos < slen); - if (pe < 0) - pe = slen; - } + tstr = Parrot_str_iter_substr(interp, str, &iter, NULL); + VTABLE_push_string(interp, res, tstr); return res; } diff --git a/src/string/charset/ascii.c b/src/string/charset/ascii.c index 1cb0f23..f5ba605 100644 --- a/src/string/charset/ascii.c +++ b/src/string/charset/ascii.c @@ -263,7 +263,6 @@ to_ascii(PARROT_INTERP, ARGIN(STRING *src), ARGMOD_NULLOK(STRING *dest)) { ASSERT_ARGS(to_ascii) String_iter iter; - UINTVAL offs; unsigned char *p; const UINTVAL len = src->strlen; @@ -275,9 +274,9 @@ 
to_ascii(PARROT_INTERP, ARGIN(STRING *src), ARGMOD_NULLOK(STRING *dest)) dest = src; } p = (unsigned char *)dest->strstart; - ENCODING_ITER_INIT(interp, src, &iter); - for (offs = 0; offs < len; ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < len) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (c >= 128) Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION, "can't convert unicode string to ascii"); @@ -557,11 +556,10 @@ ascii_compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs)) return ret_val < 0 ? -1 : 1; } else { - UINTVAL offs; - ENCODING_ITER_INIT(interp, rhs, &iter); - for (offs = 0; offs < min_len; ++offs) { - const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, offs); - const UINTVAL cr = iter.get_and_advance(interp, &iter); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < min_len) { + const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, iter.charpos); + const UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, rhs, &iter); if (cl != cr) return cl < cr ? 
-1 : 1; } @@ -595,44 +593,12 @@ mixed_cs_index(PARROT_INTERP, ARGIN(STRING *src), ARGIN(STRING *search), UINTVAL offs) { ASSERT_ARGS(mixed_cs_index) - String_iter src_iter, search_iter; - UINTVAL len, next_pos; - INTVAL found_at; - - ENCODING_ITER_INIT(interp, src, &src_iter); - src_iter.set_position(interp, &src_iter, offs); - ENCODING_ITER_INIT(interp, search, &search_iter); - len = search->strlen; - - found_at = -1; - next_pos = offs; - - for (; len && offs < src->strlen ;) { - const UINTVAL c1 = src_iter.get_and_advance(interp, &src_iter); - const UINTVAL c2 = search_iter.get_and_advance(interp, &search_iter); - - if (c1 == c2) { - --len; - if (found_at == -1) - found_at = offs; - ++offs; - } - else { - len = search->strlen; - ++offs; - ++next_pos; - if (offs != next_pos) { - src_iter.set_position(interp, &src_iter, next_pos); - offs = next_pos; - } - - found_at = -1; - search_iter.set_position(interp, &search_iter, 0); - } - } - if (len == 0) - return found_at; - return -1; + String_iter start, end; + + STRING_ITER_INIT(interp, &start); + STRING_ITER_SET_POSITION(interp, src, &start, offs); + + return Parrot_str_iter_index(interp, src, &start, &end, search); } /* @@ -711,12 +677,12 @@ static UINTVAL validate(PARROT_INTERP, ARGIN(STRING *src)) { ASSERT_ARGS(validate) - UINTVAL offset; + const UINTVAL len = Parrot_str_byte_length(interp, src); String_iter iter; - ENCODING_ITER_INIT(interp, src, &iter); - for (offset = 0; offset < Parrot_str_byte_length(interp, src); ++offset) { - const UINTVAL codepoint = iter.get_and_advance(interp, &iter); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < len) { + const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (codepoint >= 0x80) return 0; } diff --git a/src/string/charset/iso-8859-1.c b/src/string/charset/iso-8859-1.c index b88c11d..65c663a 100644 --- a/src/string/charset/iso-8859-1.c +++ b/src/string/charset/iso-8859-1.c @@ -215,10 +215,10 @@ static STRING * 
to_iso_8859_1(PARROT_INTERP, ARGIN(STRING *src), ARGMOD_NULLOK(STRING *dest)) { ASSERT_ARGS(to_iso_8859_1) - UINTVAL offs, src_len; + UINTVAL src_len; String_iter iter; - ENCODING_ITER_INIT(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); src_len = src->strlen; if (dest) { Parrot_gc_reallocate_string_storage(interp, dest, src_len); @@ -229,16 +229,16 @@ to_iso_8859_1(PARROT_INTERP, ARGIN(STRING *src), ARGMOD_NULLOK(STRING *dest)) dest = src; } dest->bufused = src_len; - dest->charset = Parrot_iso_8859_1_charset_ptr; - dest->encoding = Parrot_fixed_8_encoding_ptr; - for (offs = 0; offs < src_len; ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + while (iter.charpos < src_len) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (c >= 0x100) Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION, "lossy conversion to iso-8559-1"); - ENCODING_SET_BYTE(interp, dest, offs, c); + Parrot_fixed_8_encoding_ptr->set_byte(interp, dest, iter.charpos - 1, c); } + dest->charset = Parrot_iso_8859_1_charset_ptr; + dest->encoding = Parrot_fixed_8_encoding_ptr; return dest; } @@ -258,24 +258,23 @@ to_unicode(PARROT_INTERP, ARGIN(STRING *src), ARGMOD_NULLOK(STRING *dest)) { ASSERT_ARGS(to_unicode) if (dest) { - UINTVAL offs; String_iter iter; dest->charset = Parrot_unicode_charset_ptr; dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, dest); Parrot_gc_reallocate_string_storage(interp, dest, src->strlen); - ENCODING_ITER_INIT(interp, dest, &iter); - for (offs = 0; offs < src->strlen; ++offs) { - const UINTVAL c = ENCODING_GET_BYTE(interp, src, offs); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < src->strlen) { + const UINTVAL c = ENCODING_GET_BYTE(interp, src, iter.charpos); if (iter.bytepos >= Buffer_buflen(dest) - 4) { - UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5); + UINTVAL need = (UINTVAL)((src->strlen - iter.charpos) * 1.5); if (need < 16) need = 16; 
Parrot_gc_reallocate_string_storage(interp, dest, Buffer_buflen(dest) + need); } - iter.set_and_advance(interp, &iter, c); + STRING_ITER_SET_AND_ADVANCE(interp, dest, &iter, c); } dest->bufused = iter.bytepos; dest->strlen = iter.charpos; diff --git a/src/string/charset/unicode.c b/src/string/charset/unicode.c index 77b0893..98f6e84 100644 --- a/src/string/charset/unicode.c +++ b/src/string/charset/unicode.c @@ -704,20 +704,20 @@ compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs)) { ASSERT_ARGS(compare) String_iter l_iter, r_iter; - UINTVAL offs, cl, cr, min_len, l_len, r_len; + UINTVAL min_len, l_len, r_len; /* TODO make optimized equal - strings are equal length then already */ - ENCODING_ITER_INIT(interp, lhs, &l_iter); - ENCODING_ITER_INIT(interp, rhs, &r_iter); + STRING_ITER_INIT(interp, &l_iter); + STRING_ITER_INIT(interp, &r_iter); l_len = lhs->strlen; r_len = rhs->strlen; min_len = l_len > r_len ? r_len : l_len; - for (offs = 0; offs < min_len; ++offs) { - cl = l_iter.get_and_advance(interp, &l_iter); - cr = r_iter.get_and_advance(interp, &r_iter); + while (l_iter.charpos < min_len) { + UINTVAL cl = STRING_ITER_GET_AND_ADVANCE(interp, lhs, &l_iter); + UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, rhs, &r_iter); if (cl != cr) return cl < cr ? 
-1 : 1; @@ -769,12 +769,12 @@ static UINTVAL validate(PARROT_INTERP, ARGIN(STRING *src)) { ASSERT_ARGS(validate) - UINTVAL offset; + UINTVAL len = Parrot_str_byte_length(interp, src); String_iter iter; - ENCODING_ITER_INIT(interp, src, &iter); - for (offset = 0; offset < Parrot_str_byte_length(interp, src); ++offset) { - const UINTVAL codepoint = iter.get_and_advance(interp, &iter); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < len) { + const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); /* Check for Unicode non-characters */ if (codepoint >= 0xfdd0 && (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe) @@ -924,24 +924,22 @@ find_cclass(PARROT_INTERP, INTVAL flags, ASSERT_ARGS(find_cclass) String_iter iter; UINTVAL codepoint; - UINTVAL pos = offset; UINTVAL end = offset + count; - ENCODING_ITER_INIT(interp, source_string, &iter); - - iter.set_position(interp, &iter, pos); + STRING_ITER_INIT(interp, &iter); + STRING_ITER_SET_POSITION(interp, source_string, &iter, offset); end = source_string->strlen < end ? source_string->strlen : end; - for (; pos < end; ++pos) { - codepoint = iter.get_and_advance(interp, &iter); + while (iter.charpos < end) { + codepoint = STRING_ITER_GET_AND_ADVANCE(interp, source_string, &iter); if (codepoint >= 256) { if (u_iscclass(interp, codepoint, flags)) - return pos; + return iter.charpos - 1; } else { if (Parrot_iso_8859_1_typetable[codepoint] & flags) - return pos; + return iter.charpos - 1; } } @@ -965,37 +963,36 @@ find_not_cclass(PARROT_INTERP, INTVAL flags, ASSERT_ARGS(find_not_cclass) String_iter iter; UINTVAL codepoint; - UINTVAL pos = offset; UINTVAL end = offset + count; int bit; - if (pos > source_string->strlen) { + if (offset > source_string->strlen) { /* XXX: Throw in this case? 
*/ return offset + count; } - ENCODING_ITER_INIT(interp, source_string, &iter); + STRING_ITER_INIT(interp, &iter); - if (pos) - iter.set_position(interp, &iter, pos); + if (offset) + STRING_ITER_SET_POSITION(interp, source_string, &iter, offset); end = source_string->strlen < end ? source_string->strlen : end; if (flags == enum_cclass_any) return end; - for (; pos < end; ++pos) { - codepoint = iter.get_and_advance(interp, &iter); + while (iter.charpos < end) { + codepoint = STRING_ITER_GET_AND_ADVANCE(interp, source_string, &iter); if (codepoint >= 256) { for (bit = enum_cclass_uppercase; bit <= enum_cclass_word ; bit <<= 1) { if ((bit & flags) && !u_iscclass(interp, codepoint, bit)) - return pos; + return iter.charpos - 1; } } else { if (!(Parrot_iso_8859_1_typetable[codepoint] & flags)) - return pos; + return iter.charpos - 1; } } @@ -1023,8 +1020,8 @@ string_from_codepoint(PARROT_INTERP, UINTVAL codepoint) dest->strlen = 1; - ENCODING_ITER_INIT(interp, dest, &iter); - iter.set_and_advance(interp, &iter, codepoint); + STRING_ITER_INIT(interp, &iter); + STRING_ITER_SET_AND_ADVANCE(interp, dest, &iter, codepoint); dest->bufused = iter.bytepos; return dest; @@ -1047,13 +1044,12 @@ compute_hash(PARROT_INTERP, ARGIN(const STRING *src), size_t seed) { ASSERT_ARGS(compute_hash) String_iter iter; - UINTVAL offs; size_t hashval = seed; - ENCODING_ITER_INIT(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); - for (offs = 0; offs < src->strlen; ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + while (iter.charpos < src->strlen) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); hashval += hashval << 5; hashval += c; } diff --git a/src/string/encoding/fixed_8.c b/src/string/encoding/fixed_8.c index dd41129..712479d 100644 --- a/src/string/encoding/fixed_8.c +++ b/src/string/encoding/fixed_8.c @@ -50,6 +50,48 @@ static UINTVAL fixed8_get_next(PARROT_INTERP, ARGMOD(String_iter *iter)) __attribute__nonnull__(2) FUNC_MODIFIES(*iter); 
+static UINTVAL fixed8_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *iter), + INTVAL offset) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +static UINTVAL fixed8_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *iter)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*iter); + +static void fixed8_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), + ARGMOD(String_iter *iter), + UINTVAL c) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str) + FUNC_MODIFIES(*iter); + +static void fixed8_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *iter), + UINTVAL pos) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*iter); + +static void fixed8_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *iter), + INTVAL skip) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*iter); + static void fixed8_set_next(PARROT_INTERP, ARGMOD(String_iter *iter), UINTVAL c) @@ -181,6 +223,24 @@ static STRING * to_encoding(PARROT_INTERP, #define ASSERT_ARGS_fixed8_get_next __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_set_position __attribute__unused__ 
int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) #define ASSERT_ARGS_fixed8_set_next __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(iter)) @@ -581,6 +641,108 @@ bytes(SHIM_INTERP, ARGIN(STRING *source_string)) /* +=item C + +Get the character at C plus C. + +=cut + +*/ + +static UINTVAL +fixed8_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), ARGIN(const String_iter *iter), INTVAL offset) +{ + ASSERT_ARGS(fixed8_iter_get) + return get_byte(interp, str, iter->charpos + offset); +} + +/* + +=item C + +Moves the string iterator C by C characters. + +=cut + +*/ + +static void +fixed8_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *iter), INTVAL skip) +{ + ASSERT_ARGS(fixed8_iter_skip) + iter->bytepos += skip; + iter->charpos += skip; + PARROT_ASSERT(iter->bytepos <= Buffer_buflen(str)); +} + +/* + +=item C + +Moves the string iterator C to the next codepoint. + +=cut + +*/ + +static UINTVAL +fixed8_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *iter)) +{ + ASSERT_ARGS(fixed8_iter_get_and_advance) + const UINTVAL c = get_byte(interp, str, iter->charpos++); + iter->bytepos++; + return c; +} + +/* + +=item C + +With the string iterator C, appends the codepoint C and advances to the +next position in the string. + +=cut + +*/ + +static void +fixed8_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), ARGMOD(String_iter *iter), UINTVAL c) +{ + ASSERT_ARGS(fixed8_iter_set_and_advance) + set_byte(interp, str, iter->charpos++, c); + iter->bytepos++; +} + +/* + +=item C + +Moves the string iterator C to the position C in the string. 
+ +=cut + +*/ + +static void +fixed8_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *iter), UINTVAL pos) +{ + ASSERT_ARGS(fixed8_iter_set_position) + iter->bytepos = iter->charpos = pos; + PARROT_ASSERT(pos <= Buffer_buflen(str)); +} + +/* + =item C Moves the string iterator C to the next codepoint. @@ -695,7 +857,12 @@ Parrot_encoding_fixed_8_init(PARROT_INTERP) codepoints, bytes, iter_init, - find_cclass + find_cclass, + fixed8_iter_get, + fixed8_iter_skip, + fixed8_iter_get_and_advance, + fixed8_iter_set_and_advance, + fixed8_iter_set_position }; STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding); diff --git a/src/string/encoding/ucs2.c b/src/string/encoding/ucs2.c index 71ef8b1..6a7459c 100644 --- a/src/string/encoding/ucs2.c +++ b/src/string/encoding/ucs2.c @@ -164,6 +164,48 @@ static void ucs2_encode_and_advance(PARROT_INTERP, __attribute__nonnull__(2) FUNC_MODIFIES(*i); +static UINTVAL ucs2_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *i), + INTVAL offset) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +static UINTVAL ucs2_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void ucs2_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), + ARGMOD(String_iter *i), + UINTVAL c) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str) + FUNC_MODIFIES(*i); + +static void ucs2_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + UINTVAL n) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void ucs2_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + INTVAL skip) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + static void 
ucs2_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL n) @@ -219,6 +261,24 @@ static void ucs2_set_position(SHIM_INTERP, #define ASSERT_ARGS_ucs2_encode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs2_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs2_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs2_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs2_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs2_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) #define ASSERT_ARGS_ucs2_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(i)) /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */ @@ -397,11 +457,11 @@ get_codepoints(PARROT_INTERP, ARGIN(STRING *src), UINTVAL offset, UINTVAL count) String_iter iter; UINTVAL start; - iter_init(interp, src, &iter); - iter.set_position(interp, &iter, offset); + STRING_ITER_INIT(interp, &iter); + ucs2_iter_set_position(interp, src, &iter, offset); start = iter.bytepos; return_string->strstart = (char *)return_string->strstart + start; - iter.set_position(interp, &iter, offset + count); + ucs2_iter_set_position(interp, src, &iter, offset + count); return_string->bufused = iter.bytepos - start; } #endif @@ -576,6 +636,150 @@ bytes(PARROT_INTERP, ARGIN(STRING *src)) /* +=item C + +Get the character at C + C. 
+ +=cut + +*/ + +static UINTVAL +ucs2_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset) +{ + ASSERT_ARGS(ucs2_iter_get) + return get_codepoint(interp, str, i->charpos + offset); +} + +/* + +=item C + +Moves the string iterator C by C characters. + +=cut + +*/ + +static void +ucs2_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip) +{ + ASSERT_ARGS(ucs2_iter_skip) + +#if PARROT_HAS_ICU + i->charpos += skip; + i->bytepos += skip * sizeof (UChar); +#else + /* This function must never be called if compiled without ICU. + * See TT #557 + */ + PARROT_ASSERT(0); +#endif +} + +/* + +=item C + +Moves the string iterator C to the next UCS-2 codepoint. + +=cut + +*/ + +static UINTVAL +ucs2_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i)) +{ + ASSERT_ARGS(ucs2_iter_get_and_advance) + +#if PARROT_HAS_ICU + UChar * const s = (UChar*) str->strstart; + size_t pos = i->bytepos / sizeof (UChar); + + /* TODO either make sure that we don't go past end or use SAFE + * iter versions + */ + const UChar c = s[pos++]; + i->charpos++; + i->bytepos = pos * sizeof (UChar); + return c; +#else + /* This function must never be called if compiled without ICU. + * See TT #557 + */ + PARROT_ASSERT(0); + return (UINTVAL)0; /* Stop the static analyzers from panicing */ +#endif +} + +/* + +=item C + +With the string iterator C, appends the codepoint C and advances to the +next position in the string. + +=cut + +*/ + +static void +ucs2_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c) +{ + ASSERT_ARGS(ucs2_iter_set_and_advance) + +#if PARROT_HAS_ICU + UChar * const s = (UChar*) str->strstart; + UINTVAL pos = i->bytepos / sizeof (UChar); + s[pos++] = (UChar)c; + i->charpos++; + i->bytepos = pos * sizeof (UChar); +#else + /* This function must never be called if compiled without ICU. 
+ * See TT #557 + */ + PARROT_ASSERT(0); +#endif +} + +/* + +=item C + +Moves the string iterator C to the position C in the string. + +=cut + +*/ + +static void +ucs2_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n) +{ + ASSERT_ARGS(ucs2_iter_set_position) + +#if PARROT_HAS_ICU + i->charpos = n; + i->bytepos = n * sizeof (UChar); +#else + /* This function must never be called if compiled without ICU. + * See TT #557 + */ + PARROT_ASSERT(0); +#endif +} + +/* + =item C Moves the string iterator C to the next UCS-2 codepoint. @@ -729,7 +933,12 @@ Parrot_encoding_ucs2_init(PARROT_INTERP) codepoints, bytes, iter_init, - find_cclass + find_cclass, + ucs2_iter_get, + ucs2_iter_skip, + ucs2_iter_get_and_advance, + ucs2_iter_set_and_advance, + ucs2_iter_set_position }; STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding); Parrot_register_encoding(interp, "ucs2", return_encoding); diff --git a/src/string/encoding/utf16.c b/src/string/encoding/utf16.c index 6fa5bb4..f0749d9 100644 --- a/src/string/encoding/utf16.c +++ b/src/string/encoding/utf16.c @@ -161,6 +161,51 @@ static void utf16_encode_and_advance(PARROT_INTERP, __attribute__nonnull__(2) FUNC_MODIFIES(*i); +static UINTVAL utf16_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *i), + INTVAL offset) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +PARROT_WARN_UNUSED_RESULT +static UINTVAL utf16_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void utf16_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), + ARGMOD(String_iter *i), + UINTVAL c) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str) + FUNC_MODIFIES(*i); + +static void utf16_iter_set_position(PARROT_INTERP, + ARGIN(const STRING 
*str), + ARGMOD(String_iter *i), + UINTVAL n) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void utf16_iter_skip(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + INTVAL skip) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + static void utf16_set_position(PARROT_INTERP, ARGMOD(String_iter *i), UINTVAL n) @@ -223,6 +268,26 @@ static void utf16_set_position(PARROT_INTERP, #define ASSERT_ARGS_utf16_encode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) #define ASSERT_ARGS_utf16_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(i)) @@ -498,11 +563,11 @@ get_codepoints(PARROT_INTERP, ARGIN(STRING *src), UINTVAL offset, UINTVAL count) UINTVAL start; STRING * const return_string = Parrot_str_new_COW(interp, src); - iter_init(interp, src, &iter); - iter.set_position(interp, &iter, offset); + STRING_ITER_INIT(interp, &iter); + 
utf16_iter_set_position(interp, src, &iter, offset); start = iter.bytepos; return_string->strstart = (char *)return_string->strstart + start ; - iter.set_position(interp, &iter, offset + count); + utf16_iter_skip(interp, src, &iter, count); return_string->bufused = iter.bytepos - start; return_string->strlen = count; return_string->hashval = 0; @@ -532,11 +597,11 @@ get_codepoints_inplace(PARROT_INTERP, ARGIN(STRING *src), String_iter iter; UINTVAL start; Parrot_str_reuse_COW(interp, src, return_string); - iter_init(interp, src, &iter); - iter.set_position(interp, &iter, offset); + STRING_ITER_INIT(interp, &iter); + utf16_iter_set_position(interp, src, &iter, offset); start = iter.bytepos; return_string->strstart = (char *)return_string->strstart + start ; - iter.set_position(interp, &iter, offset + count); + utf16_iter_skip(interp, src, &iter, count); return_string->bufused = iter.bytepos - start; return_string->strlen = count; return_string->hashval = 0; @@ -675,15 +740,23 @@ static UINTVAL codepoints(PARROT_INTERP, ARGIN(STRING *src)) { ASSERT_ARGS(codepoints) - String_iter iter; +#if PARROT_HAS_ICU + UChar *s = (UChar*) src->strstart; + UINTVAL pos = 0, charpos = 0; /* * this is used to initially calculate src->strlen, * therefore we must scan the whole string */ - iter_init(interp, src, &iter); - while (iter.bytepos < src->bufused) - iter.get_and_advance(interp, &iter); - return iter.charpos; + for (; pos * sizeof(UChar) < src->bufused; charpos++) { + U16_FWD_1_UNSAFE(s, pos); /* pos advances 1 or 2 UChars per codepoint */ + } + return charpos; /* codepoints seen, not the byte offset reached */ +#else + UNUSED(src); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif } /* @@ -704,6 +777,189 @@ bytes(SHIM_INTERP, ARGIN(STRING *src)) return src->bufused; } +/* + +=item C + +Get the character at C plus C.
+ +=cut + +*/ + +static UINTVAL +utf16_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset) +{ + ASSERT_ARGS(utf16_iter_get) +#if PARROT_HAS_ICU + UChar *s = (UChar*) str->strstart; + UINTVAL c, pos; + + pos = i->bytepos / sizeof (UChar); + if (offset > 0) { + U16_FWD_N_UNSAFE(s, pos, offset); + } + else if (offset < 0) { + U16_BACK_N_UNSAFE(s, pos, -offset); + } + U16_GET_UNSAFE(s, pos, c); + + return c; +#else + UNUSED(str); + UNUSED(i); + UNUSED(offset); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + +/* + +=item C + +Moves the string iterator C by C characters. + +=cut + +*/ + +static void +utf16_iter_skip(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip) +{ + ASSERT_ARGS(utf16_iter_skip) +#if PARROT_HAS_ICU + UChar * const s = (UChar*) str->strstart; + UINTVAL pos = i->bytepos / sizeof (UChar); + + if (skip > 0) { + U16_FWD_N_UNSAFE(s, pos, skip); + } + else if (skip < 0) { + U16_BACK_N_UNSAFE(s, pos, -skip); + } + + i->charpos += skip; + i->bytepos = pos * sizeof (UChar); +#else + UNUSED(str); + UNUSED(i); + UNUSED(skip); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + +/* + +=item C + +Moves the string iterator C to the next UTF-16 codepoint. 
+ +=cut + +*/ + +PARROT_WARN_UNUSED_RESULT +static UINTVAL +utf16_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i)) +{ + ASSERT_ARGS(utf16_iter_get_and_advance) +#if PARROT_HAS_ICU + UChar *s = (UChar*) str->strstart; + UINTVAL c, pos; + pos = i->bytepos / sizeof (UChar); + /* TODO either make sure that we don't go past end or use SAFE + * iter versions + */ + U16_NEXT_UNSAFE(s, pos, c); + i->charpos++; + i->bytepos = pos * sizeof (UChar); + return c; +#else + UNUSED(str); + UNUSED(i); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + +/* + +=item C + +With the string iterator C, appends the codepoint C and advances to the +next position in the string. + +=cut + +*/ + +static void +utf16_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c) +{ + ASSERT_ARGS(utf16_iter_set_and_advance) +#if PARROT_HAS_ICU + UChar *s = (UChar*) str->strstart; + UINTVAL pos; + pos = i->bytepos / sizeof (UChar); + U16_APPEND_UNSAFE(s, pos, c); + i->charpos++; + i->bytepos = pos * sizeof (UChar); +#else + UNUSED(str); + UNUSED(i); + UNUSED(c); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + +/* + +=item C + +Moves the string iterator C to the position C in the string. 
+ +=cut + +*/ + +static void +utf16_iter_set_position(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n) +{ + ASSERT_ARGS(utf16_iter_set_position) +#if PARROT_HAS_ICU + UChar * const s = (UChar*) str->strstart; + UINTVAL pos; + pos = 0; + U16_FWD_N_UNSAFE(s, pos, n); + i->charpos = n; + i->bytepos = pos * sizeof (UChar); +#else + UNUSED(str); + UNUSED(i); + UNUSED(n); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + #if PARROT_HAS_ICU /* @@ -843,7 +1099,12 @@ Parrot_encoding_utf16_init(PARROT_INTERP) codepoints, bytes, iter_init, - find_cclass + find_cclass, + utf16_iter_get, + utf16_iter_skip, + utf16_iter_get_and_advance, + utf16_iter_set_and_advance, + utf16_iter_set_position }; STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding); Parrot_register_encoding(interp, "utf16", return_encoding); diff --git a/src/string/encoding/utf8.c b/src/string/encoding/utf8.c index e81505f..fc8c262 100644 --- a/src/string/encoding/utf8.c +++ b/src/string/encoding/utf8.c @@ -170,6 +170,48 @@ static void utf8_encode_and_advance(PARROT_INTERP, __attribute__nonnull__(2) FUNC_MODIFIES(*i); +static UINTVAL utf8_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *i), + INTVAL offset) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +static UINTVAL utf8_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void utf8_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), + ARGMOD(String_iter *i), + UINTVAL c) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str) + FUNC_MODIFIES(*i); + +static void utf8_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + UINTVAL pos) + __attribute__nonnull__(2) + 
__attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void utf8_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + INTVAL skip) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + static void utf8_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL pos) @@ -244,6 +286,24 @@ static const void * utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n) #define ASSERT_ARGS_utf8_encode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) #define ASSERT_ARGS_utf8_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(i)) #define ASSERT_ARGS_utf8_skip_backward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ @@ -456,6 +516,194 @@ utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n) /* +=item C + +Get the character at C plus C. 
+ +=cut + +*/ + +static UINTVAL +utf8_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset) +{ + ASSERT_ARGS(utf8_iter_get) + const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos); + + if (offset > 0) { + u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, offset); + } + else if (offset < 0) { + u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr, -offset); + } + + return utf8_decode(interp, u8ptr); +} + +/* + +=item C + +Moves the string iterator C by C characters. + +=cut + +*/ + +static void +utf8_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip) +{ + ASSERT_ARGS(utf8_iter_skip) + const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos); + + if (skip > 0) { + u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, skip); + } + else if (skip < 0) { + u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr, -skip); + } + + i->charpos += skip; + i->bytepos = (const char *)u8ptr - (const char *)str->strstart; +} + +/* + +=item C + +The UTF-8 implementation of the string iterator's C +function. 
+ +=cut + +*/ + +static UINTVAL +utf8_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i)) +{ + ASSERT_ARGS(utf8_iter_get_and_advance) + const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos); + UINTVAL c = *u8ptr; + + if (UTF8_IS_START(c)) { + UINTVAL len = UTF8SKIP(u8ptr); + + c &= UTF8_START_MASK(len); + i->bytepos += len; + for (len--; len; len--) { + u8ptr++; + + if (!UTF8_IS_CONTINUATION(*u8ptr)) + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, + "Malformed UTF-8 string\n"); + c = UTF8_ACCUMULATE(c, *u8ptr); + } + + if (UNICODE_IS_SURROGATE(c)) + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, + "Surrogate in UTF-8 string\n"); + } + else if (!UNICODE_IS_INVARIANT(c)) { + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, + "Malformed UTF-8 string\n"); + } + else { + i->bytepos++; + } + + i->charpos++; + return c; +} + +/* + +=item C + +The UTF-8 implementation of the string iterator's C +function. + +=cut + +*/ + +static void +utf8_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c) +{ + ASSERT_ARGS(utf8_iter_set_and_advance) + unsigned char * const pos = (unsigned char *)str->strstart + i->bytepos; + unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c); + + i->bytepos += (new_pos - pos); + /* XXX possible buffer overrun exception? */ + PARROT_ASSERT(i->bytepos <= Buffer_buflen(str)); + i->charpos++; +} + +/* + +=item C + +The UTF-8 implementation of the string iterator's C +function. 
+ +=cut + +*/ + +static void +utf8_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL pos) +{ + ASSERT_ARGS(utf8_iter_set_position) + const utf8_t *u8ptr = (const utf8_t *)str->strstart; + + if (pos == 0) { + i->charpos = 0; + i->bytepos = 0; + return; + } + + /* + * we know the byte offsets of three positions: start, current and end + * now find the shortest way to reach pos + */ + if (pos < i->charpos) { + if (pos <= (i->charpos >> 1)) { + /* go forward from start */ + u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, pos); + } + else { + /* go backward from current */ + u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr + i->bytepos, i->charpos - pos); + } + } + else { + const UINTVAL len = str->strlen; + if (pos <= i->charpos + ((len - i->charpos) >> 1)) { + /* go forward from current */ + u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr + i->bytepos, pos - i->charpos); + } + else { + /* go backward from end */ + u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr + str->bufused, len - pos); + } + } + + i->charpos = pos; + i->bytepos = (const char *)u8ptr - (const char *)str->strstart; +} + +/* + =item C The UTF-8 implementation of the string iterator's C @@ -582,8 +830,8 @@ to_encoding(PARROT_INTERP, ARGMOD(STRING *src), ARGMOD_NULLOK(STRING *dest)) { ASSERT_ARGS(to_encoding) STRING *result; - String_iter src_iter; - UINTVAL offs, dest_len, dest_pos, src_len; + const ENCODING *src_encoding; + UINTVAL dest_len, dest_pos, src_len; const int in_place = (dest == NULL); unsigned char *new_pos, *pos, *p; @@ -597,8 +845,8 @@ to_encoding(PARROT_INTERP, ARGMOD(STRING *src), ARGMOD_NULLOK(STRING *dest)) result = dest; } - /* init iter before possilby changing encoding */ - ENCODING_ITER_INIT(interp, src, &src_iter); + /* save source encoding before possibly changing it */ + src_encoding = src->encoding; result->charset = Parrot_unicode_charset_ptr; result->encoding = Parrot_utf8_encoding_ptr; result->strlen = src_len; @@ -621,12 +869,14 
@@ to_encoding(PARROT_INTERP, ARGMOD(STRING *src), ARGMOD_NULLOK(STRING *dest)) result->bufused = dest_len; } else { + String_iter src_iter; + STRING_ITER_INIT(interp, &src_iter); dest_len = src_len; dest_pos = 0; - for (offs = 0; offs < src_len; ++offs) { - const UINTVAL c = src_iter.get_and_advance(interp, &src_iter); + while (src_iter.charpos < src_len) { + const UINTVAL c = src_encoding->iter_get_and_advance(interp, src, &src_iter); if (dest_len - dest_pos < 6) { - UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5); + UINTVAL need = (UINTVAL)((src->strlen - src_iter.charpos + 1) * 1.5); if (need < 16) need = 16; dest_len += need; @@ -790,16 +1040,16 @@ get_codepoints(PARROT_INTERP, ARGIN(STRING *src), UINTVAL offset, UINTVAL count) String_iter iter; UINTVAL start; - iter_init(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); if (offset) - iter.set_position(interp, &iter, offset); + utf8_iter_set_position(interp, src, &iter, offset); start = iter.bytepos; return_string->strstart = (char *)return_string->strstart + start; if (count) - iter.set_position(interp, &iter, offset + count); + utf8_iter_set_position(interp, src, &iter, offset + count); return_string->bufused = iter.bytepos - start; return_string->strlen = count; @@ -860,13 +1110,13 @@ get_codepoints_inplace(PARROT_INTERP, ARGMOD(STRING *src), UINTVAL start; Parrot_str_reuse_COW(interp, src, return_string); - iter_init(interp, src, &iter); - iter.set_position(interp, &iter, offset); + STRING_ITER_INIT(interp, &iter); + utf8_iter_set_position(interp, src, &iter, offset); start = iter.bytepos; return_string->strstart = (char *)return_string->strstart + start; - iter.set_position(interp, &iter, offset + count); + utf8_iter_set_position(interp, src, &iter, offset + count); return_string->bufused = iter.bytepos - start; return_string->strlen = count; @@ -973,9 +1223,9 @@ codepoints(PARROT_INTERP, ARGMOD(STRING *src)) * this is used to initially calculate src->strlen, * therefore we must scan the whole 
string */ - iter_init(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); while (iter.bytepos < src->bufused) - iter.get_and_advance(interp, &iter); + utf8_iter_get_and_advance(interp, src, &iter); return iter.charpos; } @@ -1055,7 +1305,12 @@ Parrot_encoding_utf8_init(PARROT_INTERP) codepoints, bytes, iter_init, - find_cclass + find_cclass, + utf8_iter_get, + utf8_iter_skip, + utf8_iter_get_and_advance, + utf8_iter_set_and_advance, + utf8_iter_set_position }; STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding); Parrot_register_encoding(interp, "utf8", return_encoding);