diff --git a/include/parrot/encoding.h b/include/parrot/encoding.h index 84d42eb..5965ad6 100644 --- a/include/parrot/encoding.h +++ b/include/parrot/encoding.h @@ -32,6 +32,16 @@ struct string_iterator_t; /* s. parrot/string.h */ typedef void (*encoding_iter_init_t)(PARROT_INTERP, const STRING *src, struct string_iterator_t *); +typedef UINTVAL (*encoding_iter_get_t)( + PARROT_INTERP, const STRING *str, const String_iter *i, INTVAL offset); +typedef void (*encoding_iter_skip_t)( + PARROT_INTERP, const STRING *str, String_iter *i, INTVAL skip); +typedef UINTVAL (*encoding_iter_get_and_advance_t)( + PARROT_INTERP, const STRING *str, String_iter *i); +typedef void (*encoding_iter_set_and_advance_t)( + PARROT_INTERP, STRING *str, String_iter *i, UINTVAL c); +typedef void (*encoding_iter_set_position_t)( + PARROT_INTERP, const STRING *str, String_iter *i, UINTVAL pos); struct _encoding { ARGIN(const char *name); @@ -47,6 +57,11 @@ struct _encoding { encoding_iter_init_t iter_init; encoding_find_cclass_t find_cclass; encoding_hash_t hash; + encoding_iter_get_t iter_get; + encoding_iter_skip_t iter_skip; + encoding_iter_get_and_advance_t iter_get_and_advance; + encoding_iter_set_and_advance_t iter_set_and_advance; + encoding_iter_set_position_t iter_set_position; }; typedef struct _encoding ENCODING; diff --git a/include/parrot/string.h b/include/parrot/string.h index 8914db3..d02f5c1 100644 --- a/include/parrot/string.h +++ b/include/parrot/string.h @@ -38,6 +38,19 @@ typedef struct string_iterator_t { void (*set_position)(PARROT_INTERP, struct string_iterator_t *i, UINTVAL pos); } String_iter; +#define STRING_ITER_INIT(i, iter) \ + (iter)->charpos = (iter)->bytepos = 0 +#define STRING_ITER_GET(i, str, iter, offset) \ + ((str)->encoding)->iter_get((i), (str), (iter), (offset)) +#define STRING_ITER_SKIP(i, str, iter, skip) \ + ((str)->encoding)->iter_skip((i), (str), (iter), (skip)) +#define STRING_ITER_GET_AND_ADVANCE(i, str, iter) \ + 
((str)->encoding)->iter_get_and_advance((i), (str), (iter)) +#define STRING_ITER_SET_AND_ADVANCE(i, str, iter, c) \ + ((str)->encoding)->iter_set_and_advance((i), (str), (iter), (c)) +#define STRING_ITER_SET_POSITION(i, str, iter, pos) \ + ((str)->encoding)->iter_set_position((i), (str), (iter), (pos)) + #define STREQ(x, y) (strcmp((x), (y))==0) #define STRNEQ(x, y) (strcmp((x), (y))!=0) diff --git a/include/parrot/string_funcs.h b/include/parrot/string_funcs.h index 7c35265..63501fe 100644 --- a/include/parrot/string_funcs.h +++ b/include/parrot/string_funcs.h @@ -226,6 +226,31 @@ PARROT_PURE_FUNCTION INTVAL Parrot_str_is_null(SHIM_INTERP, ARGIN_NULLOK(const STRING *s)); PARROT_EXPORT +INTVAL Parrot_str_iter_index(PARROT_INTERP, + ARGIN(const STRING *src), + ARGMOD(String_iter *start), + ARGOUT(String_iter *end), + ARGIN(const STRING *search)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + __attribute__nonnull__(4) + __attribute__nonnull__(5) + FUNC_MODIFIES(*start) + FUNC_MODIFIES(*end); + +PARROT_EXPORT +PARROT_CANNOT_RETURN_NULL +PARROT_WARN_UNUSED_RESULT +STRING * Parrot_str_iter_substr(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *l), + ARGIN_NULLOK(const String_iter *r)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +PARROT_EXPORT PARROT_WARN_UNUSED_RESULT PARROT_CANNOT_RETURN_NULL STRING* Parrot_str_join(PARROT_INTERP, @@ -559,6 +584,16 @@ STRING* Parrot_str_from_uint(PARROT_INTERP, PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(s)) #define ASSERT_ARGS_Parrot_str_is_null __attribute__unused__ int _ASSERT_ARGS_CHECK = (0) +#define ASSERT_ARGS_Parrot_str_iter_index __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(src) \ + , PARROT_ASSERT_ARG(start) \ + , PARROT_ASSERT_ARG(end) \ + , PARROT_ASSERT_ARG(search)) +#define ASSERT_ARGS_Parrot_str_iter_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + 
PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(l)) #define ASSERT_ARGS_Parrot_str_join __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(ar)) diff --git a/src/io/utf8.c b/src/io/utf8.c index a4dc6a4..1c5b034 100644 --- a/src/io/utf8.c +++ b/src/io/utf8.c @@ -57,7 +57,7 @@ Parrot_io_read_utf8(PARROT_INTERP, ARGMOD(PMC *filehandle), s->encoding = Parrot_utf8_encoding_ptr; /* count chars, verify utf8 */ - Parrot_utf8_encoding_ptr->iter_init(interp, s, &iter); + STRING_ITER_INIT(interp, &iter); while (iter.bytepos < s->bufused) { if (iter.bytepos + 4 > s->bufused) { @@ -84,8 +84,6 @@ Parrot_io_read_utf8(PARROT_INTERP, ARGMOD(PMC *filehandle), s->strlen = iter.charpos; s = Parrot_str_concat(interp, s, s2); - /* String is updated. Poke into iterator to replace old string */ - iter.str = s; *buf = s; len += len2 + 1; @@ -93,7 +91,7 @@ Parrot_io_read_utf8(PARROT_INTERP, ARGMOD(PMC *filehandle), } } ok: - iter.get_and_advance(interp, &iter); + Parrot_utf8_encoding_ptr->iter_get_and_advance(interp, *buf, &iter); } s->strlen = iter.charpos; return len; diff --git a/src/pmc/stringiterator.pmc b/src/pmc/stringiterator.pmc index ce9001c..bda81b7 100644 --- a/src/pmc/stringiterator.pmc +++ b/src/pmc/stringiterator.pmc @@ -27,11 +27,9 @@ Implementation of Iterator for String PMC. /* HEADERIZER END: static */ pmclass StringIterator auto_attrs extends Iterator { - ATTR PMC *string; /* String to iterate over */ - ATTR INTVAL pos; /* Current position of iterator for forward iterator */ - /* Previous position of iterator for reverse iterator */ - ATTR INTVAL length; /* Length of C */ - ATTR INTVAL reverse; /* Direction of iteration. 1 - for reverse iteration */ + ATTR STRING *str_val; /* String to iterate over */ + ATTR String_iter iter; /* String iterator */ + ATTR INTVAL reverse; /* Direction of iteration. 1 - for reverse iteration */ /* @@ -43,10 +41,13 @@ Initialize StringIterator. 
*/ VTABLE void init_pmc(PMC *string) { - SET_ATTR_string(INTERP, SELF, string); + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + STRING * const str_val = VTABLE_get_string(INTERP, string); + + SET_ATTR_str_val(INTERP, SELF, str_val); + STRING_ITER_INIT(INTERP, iter); + SET_ATTR_reverse(INTERP, SELF, ITERATE_FROM_START); - /* by default, iterate from start */ - SELF.set_integer_native(ITERATE_FROM_START); PObj_custom_mark_SET(SELF); } @@ -61,9 +62,10 @@ Marks the current idx/key and the aggregate as live. */ VTABLE void mark() { - PMC *string; - GET_ATTR_string(INTERP, SELF, string); - Parrot_gc_mark_PMC_alive(INTERP, string); + STRING *str_val; + + GET_ATTR_str_val(INTERP, SELF, str_val); + Parrot_gc_mark_STRING_alive(INTERP, str_val); } /* @@ -74,15 +76,21 @@ Marks the current idx/key and the aggregate as live. */ VTABLE PMC* clone() { - Parrot_StringIterator_attributes * const attrs = - PARROT_STRINGITERATOR(SELF); - PMC * const clone = - Parrot_pmc_new_init(INTERP, enum_class_StringIterator, attrs->string); - Parrot_StringIterator_attributes * const clone_attrs = - PARROT_STRINGITERATOR(clone); - - clone_attrs->pos = attrs->pos; - clone_attrs->reverse = attrs->reverse; + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + PMC *clone, *str_pmc; + String_iter *clone_iter; + STRING *str_val; + INTVAL reverse; + + str_pmc = Parrot_pmc_new(INTERP, enum_class_String); + GET_ATTR_str_val(INTERP, SELF, str_val); + VTABLE_set_string_native(INTERP, str_pmc, str_val); + clone = Parrot_pmc_new_init(INTERP, enum_class_StringIterator, str_pmc); + clone_iter = &PARROT_STRINGITERATOR(clone)->iter; + *clone_iter = *iter; + GET_ATTR_reverse(INTERP, SELF, reverse); + SET_ATTR_reverse(INTERP, clone, reverse); + return clone; } @@ -111,12 +119,17 @@ Returns the number of remaining elements in the C. 
*/ VTABLE INTVAL elements() { - Parrot_StringIterator_attributes * const attrs = - PARROT_STRINGITERATOR(SELF); - if (attrs->reverse) - return attrs->pos; + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + STRING *str_val; + INTVAL reverse; + + GET_ATTR_str_val(INTERP, SELF, str_val); + GET_ATTR_reverse(INTERP, SELF, reverse); + + if (reverse) + return iter->charpos; else - return attrs->length - attrs->pos; + return str_val->strlen - iter->charpos; } VTABLE INTVAL get_integer() { @@ -137,20 +150,19 @@ Reset the Iterator. C must be one of */ VTABLE void set_integer_native(INTVAL value) { - Parrot_StringIterator_attributes * const attrs = - PARROT_STRINGITERATOR(SELF); - switch (value) { - case ITERATE_FROM_START: - attrs->reverse = 0; - attrs->pos = 0; - attrs->length = VTABLE_elements(INTERP, attrs->string); - break; - case ITERATE_FROM_END: - attrs->reverse = 1; - attrs->pos = attrs->length - = VTABLE_elements(INTERP, attrs->string); - break; - default: + STRING *str_val; + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + + GET_ATTR_str_val(INTERP, SELF, str_val); + if (value == ITERATE_FROM_START) { + SET_ATTR_reverse(INTERP, SELF, 0); + STRING_ITER_SET_POSITION(INTERP, str_val, iter, 0); + } + else if (value == ITERATE_FROM_END) { + SET_ATTR_reverse(INTERP, SELF, 1); + STRING_ITER_SET_POSITION(INTERP, str_val, iter, str_val->strlen); + } + else { Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_INVALID_OPERATION, "Wrong direction for StringIterator"); } @@ -167,9 +179,13 @@ Returns this Iterator's string. */ VTABLE PMC *get_pmc() { - PMC *string; - GET_ATTR_string(INTERP, SELF, string); - return string ? string : PMCNULL; + PMC * const string = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type( + interp, enum_class_String)); + STRING *str_val; + + GET_ATTR_str_val(INTERP, SELF, str_val); + VTABLE_set_string_native(interp, string, str_val); + return string; } /* @@ -182,17 +198,20 @@ Shift next character from C as PMC. 
*/ VTABLE PMC *shift_pmc() { - Parrot_StringIterator_attributes * const attrs = - PARROT_STRINGITERATOR(SELF); + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; PMC *ret; + STRING *str_val, *substr; + const String_iter old_iter = *iter; - if (attrs->pos >= attrs->length) + GET_ATTR_str_val(INTERP, SELF, str_val); + if (iter->charpos >= str_val->strlen) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); ret = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, enum_class_String)); - VTABLE_set_string_native(INTERP, ret, - VTABLE_get_string_keyed_int(INTERP, attrs->string, attrs->pos++)); + STRING_ITER_SKIP(INTERP, str_val, iter, 1); + substr = Parrot_str_iter_substr(INTERP, str_val, &old_iter, iter); + VTABLE_set_string_native(INTERP, ret, substr); return ret; } @@ -206,14 +225,17 @@ Shift next character from C. */ VTABLE STRING *shift_string() { - Parrot_StringIterator_attributes * const attrs = - PARROT_STRINGITERATOR(SELF); + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + STRING *str_val; + const String_iter old_iter = *iter; - if (attrs->pos >= attrs->length) + GET_ATTR_str_val(INTERP, SELF, str_val); + if (iter->charpos >= str_val->strlen) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); - return VTABLE_get_string_keyed_int(INTERP, attrs->string, attrs->pos++); + STRING_ITER_SKIP(INTERP, str_val, iter, 1); + return Parrot_str_iter_substr(INTERP, str_val, &old_iter, iter); } /* @@ -226,14 +248,15 @@ Shift next character code from C. 
*/ VTABLE INTVAL shift_integer() { - Parrot_StringIterator_attributes * const attrs = - PARROT_STRINGITERATOR(SELF); + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + STRING *str_val; - if (attrs->pos >= attrs->length) + GET_ATTR_str_val(INTERP, SELF, str_val); + if (iter->charpos >= str_val->strlen) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); - return VTABLE_get_integer_keyed_int(INTERP, attrs->string, attrs->pos++); + return STRING_ITER_GET_AND_ADVANCE(INTERP, str_val, iter); } /* @@ -246,17 +269,21 @@ Shift "next" character from C for reverse iterator as PMC. */ VTABLE PMC *pop_pmc() { - Parrot_StringIterator_attributes * const attrs = - PARROT_STRINGITERATOR(SELF); + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + STRING *str_val, *substr; PMC *ret; + const String_iter old_iter = *iter; - if (!STATICSELF.get_bool()) + GET_ATTR_str_val(INTERP, SELF, str_val); + /* Shouldn't this test be (iter->charpos <= 0) ? */ + if (SELF.elements() <= 0) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); ret = Parrot_pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, enum_class_String)); - VTABLE_set_string_native(INTERP, ret, - VTABLE_get_string_keyed_int(INTERP, attrs->string, --attrs->pos)); + STRING_ITER_SKIP(INTERP, str_val, iter, -1); + substr = Parrot_str_iter_substr(INTERP, str_val, iter, &old_iter); + VTABLE_set_string_native(INTERP, ret, substr); return ret; } @@ -270,14 +297,18 @@ Shift "next" character from C for reverse iterator. */ VTABLE STRING *pop_string() { - Parrot_StringIterator_attributes * const attrs = - PARROT_STRINGITERATOR(SELF); + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + STRING *str_val; + const String_iter old_iter = *iter; - if (!STATICSELF.get_bool()) + GET_ATTR_str_val(INTERP, SELF, str_val); + /* Shouldn't this test be (iter->charpos <= 0) ? 
*/ + if (SELF.elements() <= 0) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); - return VTABLE_get_string_keyed_int(INTERP, attrs->string, --attrs->pos); + STRING_ITER_SKIP(INTERP, str_val, iter, -1); + return Parrot_str_iter_substr(INTERP, str_val, iter, &old_iter); } /* @@ -290,14 +321,17 @@ Shift "next" character code from C for reverse iterator. */ VTABLE INTVAL pop_integer() { - Parrot_StringIterator_attributes * const attrs = - PARROT_STRINGITERATOR(SELF); + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + STRING *str_val; - if (!STATICSELF.get_bool()) + GET_ATTR_str_val(INTERP, SELF, str_val); + /* Shouldn't this test be (iter->charpos <= 0) ? */ + if (SELF.elements() <= 0) Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, "StopIteration"); - return VTABLE_get_integer_keyed_int(INTERP, attrs->string, --attrs->pos); + STRING_ITER_SKIP(INTERP, str_val, iter, -1); + return STRING_ITER_GET(INTERP, str_val, iter, 0); } /* @@ -311,8 +345,16 @@ Get integer value of current position plus idx. */ VTABLE INTVAL get_integer_keyed_int(INTVAL idx) { - return VTABLE_get_integer_keyed_int(INTERP, STATICSELF.get_pmc(), - PARROT_STRINGITERATOR(SELF)->pos + idx); + String_iter * const iter = &PARROT_STRINGITERATOR(SELF)->iter; + STRING *str_val; + const UINTVAL offset = iter->charpos + idx; + + GET_ATTR_str_val(INTERP, SELF, str_val); + if (offset >= str_val->strlen) + Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, + "StopIteration"); + + return STRING_ITER_GET(INTERP, str_val, iter, idx); } /* @@ -326,8 +368,22 @@ Get string value of current position plus idx. 
*/ VTABLE STRING *get_string_keyed_int(INTVAL idx) { - return VTABLE_get_string_keyed_int(INTERP, STATICSELF.get_pmc(), - PARROT_STRINGITERATOR(SELF)->pos + idx); + String_iter iter = PARROT_STRINGITERATOR(SELF)->iter; + String_iter next_iter; + STRING *str_val; + const UINTVAL offset = iter.charpos + idx; + + GET_ATTR_str_val(INTERP, SELF, str_val); + if (offset >= str_val->strlen) + Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS, + "StopIteration"); + + if (idx != 0) + STRING_ITER_SKIP(INTERP, str_val, &iter, idx); + next_iter = iter; + STRING_ITER_SKIP(INTERP, str_val, &next_iter, 1); + + return Parrot_str_iter_substr(INTERP, str_val, &iter, &next_iter); } } diff --git a/src/string/api.c b/src/string/api.c index e37354e..e2a558e 100644 --- a/src/string/api.c +++ b/src/string/api.c @@ -1104,6 +1104,104 @@ Parrot_str_substr(PARROT_INTERP, return CHARSET_GET_CODEPOINTS(interp, src, true_offset, true_length); } +/* + +=item C<STRING * Parrot_str_iter_substr(PARROT_INTERP, const STRING *str, const +String_iter *l, const String_iter *r)> + +Returns the substring between iterators C<l> and C<r>. + +=cut + +*/ + +PARROT_EXPORT +PARROT_CANNOT_RETURN_NULL +PARROT_WARN_UNUSED_RESULT +STRING * +Parrot_str_iter_substr(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *l), ARGIN_NULLOK(const String_iter *r)) +{ + ASSERT_ARGS(Parrot_str_iter_substr) + STRING *dest = Parrot_str_copy(interp, str); + + dest->strstart = (char *)dest->strstart + l->bytepos; + + if (r == NULL) { + dest->bufused = str->bufused - l->bytepos; + dest->strlen = str->strlen - l->charpos; + } + else { + dest->bufused = r->bytepos - l->bytepos; + dest->strlen = r->charpos - l->charpos; + } + + dest->hashval = 0; + + return dest; +} + +/* + +=item C<INTVAL Parrot_str_iter_index(PARROT_INTERP, const STRING *src, +String_iter *start, String_iter *end, const STRING *search)> + +Find the next occurrence of STRING C<search> in STRING C<src> starting at +String_iter C<start>. If C<search> is found C<start> is modified to mark the +beginning of C<search> and String_iter C<end> is set to the character after +C<search> in C<src>. Returns the character position where C<search> was found +or -1 if it wasn't found. 
+ +=cut + +*/ + +PARROT_EXPORT +INTVAL +Parrot_str_iter_index(PARROT_INTERP, + ARGIN(const STRING *src), + ARGMOD(String_iter *start), ARGOUT(String_iter *end), + ARGIN(const STRING *search)) +{ + ASSERT_ARGS(Parrot_str_iter_index) + String_iter search_iter, search_start, next_start; + const UINTVAL len = search->strlen; + UINTVAL c0; + + if (len == 0) { + *end = *start; + return start->charpos; + } + + STRING_ITER_INIT(interp, &search_iter); + c0 = STRING_ITER_GET_AND_ADVANCE(interp, search, &search_iter); + search_start = search_iter; + next_start = *start; + + while (start->charpos + len <= src->strlen) { + UINTVAL c1 = STRING_ITER_GET_AND_ADVANCE(interp, src, &next_start); + + if (c1 == c0) { + UINTVAL c2; + *end = next_start; + + do { + if (search_iter.charpos >= len) + return start->charpos; + c1 = STRING_ITER_GET_AND_ADVANCE(interp, src, end); + c2 = STRING_ITER_GET_AND_ADVANCE(interp, search, &search_iter); + } while (c1 == c2); + + search_iter = search_start; + } + + *start = next_start; + } + + return -1; +} + /* @@ -1145,7 +1243,7 @@ Parrot_str_replace(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL true_offset = (UINTVAL)offset; UINTVAL true_length = (UINTVAL)length; - UINTVAL start_byte, end_byte; + UINTVAL start_byte, end_byte, start_char, end_char; INTVAL buf_size; if (STRING_IS_NULL(src)) { @@ -1181,13 +1279,15 @@ Parrot_str_replace(PARROT_INTERP, ARGIN(const STRING *src), } /* get byte position of the part that will be replaced */ - ENCODING_ITER_INIT(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); - iter.set_position(interp, &iter, true_offset); + STRING_ITER_SET_POSITION(interp, src, &iter, true_offset); start_byte = iter.bytepos; + start_char = iter.charpos; - iter.set_position(interp, &iter, true_offset + true_length); + STRING_ITER_SKIP(interp, src, &iter, true_length); end_byte = iter.bytepos; + end_char = iter.charpos; /* not possible.... 
*/ if (end_byte < start_byte) @@ -1226,7 +1326,7 @@ Parrot_str_replace(PARROT_INTERP, ARGIN(const STRING *src), (char *)src->strstart + end_byte, src->bufused - end_byte); - dest->strlen = CHARSET_CODEPOINTS(interp, dest); + dest->strlen = src->strlen - (end_char - start_char) + rep->strlen; dest->hashval = 0; return dest; @@ -1252,7 +1352,7 @@ Parrot_str_chopn(PARROT_INTERP, ARGIN(const STRING *s), INTVAL n) ASSERT_ARGS(Parrot_str_chopn) STRING * const chopped = Parrot_str_copy(interp, s); - UINTVAL new_length, uchar_size; + UINTVAL new_length; if (n < 0) { new_length = -n; @@ -1273,23 +1373,23 @@ Parrot_str_chopn(PARROT_INTERP, ARGIN(const STRING *s), INTVAL n) return chopped; } - uchar_size = chopped->bufused / chopped->strlen; - chopped->strlen = new_length; - if (chopped->encoding == Parrot_fixed_8_encoding_ptr) { chopped->bufused = new_length; } else if (chopped->encoding == Parrot_ucs2_encoding_ptr) { + const UINTVAL uchar_size = chopped->bufused / chopped->strlen; chopped->bufused = new_length * uchar_size; } else { String_iter iter; - ENCODING_ITER_INIT(interp, s, &iter); - iter.set_position(interp, &iter, new_length); + STRING_ITER_INIT(interp, &iter); + STRING_ITER_SET_POSITION(interp, s, &iter, new_length); chopped->bufused = iter.bytepos; } + chopped->strlen = new_length; + return chopped; } @@ -1860,13 +1960,12 @@ Parrot_str_to_int(PARROT_INTERP, ARGIN_NULLOK(const STRING *s)) int sign = 1; UINTVAL i = 0; String_iter iter; - UINTVAL offs; number_parse_state state = parse_start; - ENCODING_ITER_INIT(interp, s, &iter); + STRING_ITER_INIT(interp, &iter); - for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + while (state != parse_end && iter.charpos < s->strlen) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, s, &iter); /* Check for overflow */ if (c > 255) break; @@ -1956,17 +2055,16 @@ Parrot_str_to_num(PARROT_INTERP, ARGIN(const STRING *s)) int d_length = 0; int 
check_nan = 0; /* Check for NaN and Inf after main loop */ String_iter iter; - UINTVAL offs; number_parse_state state = parse_start; if (STRING_IS_NULL(s)) return 0.0; - ENCODING_ITER_INIT(interp, s, &iter); + STRING_ITER_INIT(interp, &iter); /* Handcrafter FSM to read float value */ - for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + while (state != parse_end && iter.charpos < s->strlen) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, s, &iter); /* Check for overflow */ if (c > 255) break; @@ -2417,11 +2515,11 @@ Parrot_str_escape_truncate(PARROT_INTERP, Parrot_fixed_8_encoding_ptr, Parrot_ascii_charset_ptr, 0); /* more work TODO */ - ENCODING_ITER_INIT(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); dp = (unsigned char *)result->strstart; for (i = 0; len > 0; --len) { - UINTVAL c = iter.get_and_advance(interp, &iter); + UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (c < 0x7f) { /* process ASCII chars */ if (i >= charlen - 2) { @@ -2561,17 +2659,17 @@ Parrot_str_unescape_string(PARROT_INTERP, ARGIN(const STRING *src), Parrot_gc_allocate_string_storage(interp, result, reserved); result->bufused = reserved; - src->encoding->iter_init(interp, src, &itersrc); - encoding->iter_init(interp, result, &iterdest); + STRING_ITER_INIT(interp, &itersrc); + STRING_ITER_INIT(interp, &iterdest); while (itersrc.bytepos < srclen) { - INTVAL c = itersrc.get_and_advance(interp, &itersrc); + INTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc); INTVAL next; do { pending = 0; next = c; if (c == '\\') { - c = itersrc.get_and_advance(interp, &itersrc); + c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc); switch (c) { /* Common one char sequences */ case 'a': next = '\a'; break; @@ -2584,7 +2682,7 @@ Parrot_str_unescape_string(PARROT_INTERP, ARGIN(const STRING *src), case 'e': next = '\e'; break; /* Escape character */ case 'c': - c = 
itersrc.get_and_advance(interp, &itersrc); + c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc); /* This assumes ascii-alike encoding */ if (c < 'A' || c > 'Z') throw_illegal_escape(interp); @@ -2592,11 +2690,11 @@ Parrot_str_unescape_string(PARROT_INTERP, ARGIN(const STRING *src), break; case 'x': digcount = 0; - c = itersrc.get_and_advance(interp, &itersrc); + c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc); if (c == '{') { /* \x{h..h} 1..8 hex digits */ while (itersrc.bytepos < srclen) { - c = itersrc.get_and_advance(interp, &itersrc); + c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc); if (c == '}') break; if (!isxdigit(c)) @@ -2620,7 +2718,7 @@ Parrot_str_unescape_string(PARROT_INTERP, ARGIN(const STRING *src), pending = 0; break; } - c = itersrc.get_and_advance(interp, &itersrc); + c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc); } } if (digcount == 0) @@ -2631,7 +2729,7 @@ Parrot_str_unescape_string(PARROT_INTERP, ARGIN(const STRING *src), case 'u': /* \uhhhh 4 hex digits */ for (digcount = 0; digcount < 4; ++digcount) { - c = itersrc.get_and_advance(interp, &itersrc); + c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc); if (!isxdigit(c)) throw_illegal_escape(interp); digbuf[digcount] = c; @@ -2642,7 +2740,7 @@ Parrot_str_unescape_string(PARROT_INTERP, ARGIN(const STRING *src), case 'U': /* \Uhhhhhhhh 8 hex digits */ for (digcount = 0; digcount < 8; ++digcount) { - c = itersrc.get_and_advance(interp, &itersrc); + c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc); if (!isxdigit(c)) throw_illegal_escape(interp); digbuf[digcount] = c; @@ -2655,7 +2753,7 @@ Parrot_str_unescape_string(PARROT_INTERP, ARGIN(const STRING *src), /* \ooo 1..3 oct digits */ digbuf[0] = c; for (digcount = 1; digcount < 3; ++digcount) { - c = itersrc.get_and_advance(interp, &itersrc); + c = STRING_ITER_GET_AND_ADVANCE(interp, src, &itersrc); if (c < '0' || c > '7') break; digbuf[digcount] = c; @@ -2669,7 +2767,7 @@ 
Parrot_str_unescape_string(PARROT_INTERP, ARGIN(const STRING *src), next = c; } } - iterdest.set_and_advance(interp, &iterdest, next); + STRING_ITER_SET_AND_ADVANCE(interp, result, &iterdest, next); } while (pending); } result->bufused = iterdest.bytepos; @@ -2750,7 +2848,7 @@ Parrot_str_unescape(PARROT_INTERP, encoding = result->encoding; } - encoding->iter_init(interp, result, &iter); + STRING_ITER_INIT(interp, &iter); for (offs = d = 0; offs < clength; ++offs) { r = (Parrot_UInt4)((unsigned char *)result->strstart)[offs]; @@ -2773,7 +2871,7 @@ Parrot_str_unescape(PARROT_INTERP, } PARROT_ASSERT(d < offs); - iter.set_and_advance(interp, &iter, r); + encoding->iter_set_and_advance(interp, result, &iter, r); ++d; } @@ -3271,8 +3369,10 @@ Parrot_str_split(PARROT_INTERP, ARGIN_NULLOK(const STRING *delim), ARGIN_NULLOK(STRING *str)) { ASSERT_ARGS(Parrot_str_split) - PMC *res; - INTVAL slen, dlen, ps, pe; + PMC *res; + STRING *tstr; + UINTVAL slen, dlen; + String_iter iter; if (STRING_IS_NULL(delim) || STRING_IS_NULL(str)) return PMCNULL; @@ -3284,44 +3384,38 @@ Parrot_str_split(PARROT_INTERP, if (!slen) return res; + STRING_ITER_INIT(interp, &iter); dlen = Parrot_str_length(interp, delim); if (dlen == 0) { - int i; VTABLE_set_integer_native(interp, res, slen); - for (i = 0; i < slen; ++i) { - STRING * const p = Parrot_str_substr(interp, str, i, 1); - VTABLE_set_string_keyed_int(interp, res, i, p); - } - - return res; - } + do { + const String_iter old_iter = iter; - pe = Parrot_str_find_index(interp, str, delim, 0); + STRING_ITER_SKIP(interp, str, &iter, 1); + tstr = Parrot_str_iter_substr(interp, str, &old_iter, &iter); + VTABLE_set_string_keyed_int(interp, res, old_iter.charpos, tstr); + } while (iter.charpos < slen); - if (pe < 0) { - VTABLE_push_string(interp, res, str); return res; } - ps = 0; - - while (ps <= slen) { - const int pl = pe - ps; - STRING * const tstr = Parrot_str_substr(interp, str, ps, pl); - - VTABLE_push_string(interp, res, tstr); - ps = pe + 
Parrot_str_length(interp, delim); + do { + String_iter start, end; + INTVAL pos; - if (ps > slen) + start = iter; + if (Parrot_str_iter_index(interp, str, &start, &end, delim) < 0) break; - pe = Parrot_str_find_index(interp, str, delim, ps); + tstr = Parrot_str_iter_substr(interp, str, &iter, &start); + VTABLE_push_string(interp, res, tstr); + iter = end; + } while (iter.charpos < slen); - if (pe < 0) - pe = slen; - } + tstr = Parrot_str_iter_substr(interp, str, &iter, NULL); + VTABLE_push_string(interp, res, tstr); return res; } diff --git a/src/string/charset/ascii.c b/src/string/charset/ascii.c index 5c8371a..761a60e 100644 --- a/src/string/charset/ascii.c +++ b/src/string/charset/ascii.c @@ -201,7 +201,6 @@ to_ascii(PARROT_INTERP, ARGIN(const STRING *src)) { ASSERT_ARGS(to_ascii) String_iter iter; - UINTVAL offs; unsigned char *p; const UINTVAL len = src->strlen; @@ -209,9 +208,9 @@ to_ascii(PARROT_INTERP, ARGIN(const STRING *src)) STRING * const dest = Parrot_str_clone(interp, src); p = (unsigned char *)dest->strstart; - ENCODING_ITER_INIT(interp, src, &iter); - for (offs = 0; offs < len; ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < len) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (c >= 128) Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION, "can't convert unicode string to ascii"); @@ -493,11 +492,10 @@ ascii_compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs)) return ret_val < 0 ? 
-1 : 1; } else { - UINTVAL offs; - ENCODING_ITER_INIT(interp, rhs, &iter); - for (offs = 0; offs < min_len; ++offs) { - const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, offs); - const UINTVAL cr = iter.get_and_advance(interp, &iter); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < min_len) { + const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, iter.charpos); + const UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, rhs, &iter); if (cl != cr) return cl < cr ? -1 : 1; } @@ -531,35 +529,12 @@ mixed_cs_index(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *sear UINTVAL offs) { ASSERT_ARGS(mixed_cs_index) + String_iter start, end; - if (search->strlen <= src->strlen) { - String_iter src_iter, search_iter; - const UINTVAL maxpos = src->strlen - search->strlen + 1; - const UINTVAL cfirst = Parrot_str_indexed(interp, search, 0); - - ENCODING_ITER_INIT(interp, src, &src_iter); - src_iter.set_position(interp, &src_iter, offs); - ENCODING_ITER_INIT(interp, search, &search_iter); - - while (src_iter.charpos < maxpos) { - if (cfirst == src_iter.get_and_advance(interp, &src_iter)) { - const INTVAL next_pos = src_iter.charpos; - const INTVAL next_byte = src_iter.bytepos; - UINTVAL len; - search_iter.set_position(interp, &search_iter, 1); - for (len = search->strlen - 1; len; --len) { - if ((src_iter.get_and_advance(interp, &src_iter)) != - (search_iter.get_and_advance(interp, &search_iter))) - break; - } - if (len == 0) - return next_pos - 1; - src_iter.charpos = next_pos; - src_iter.bytepos = next_byte; - } - } - } - return -1; + STRING_ITER_INIT(interp, &start); + STRING_ITER_SET_POSITION(interp, src, &start, offs); + + return Parrot_str_iter_index(interp, src, &start, &end, search); } /* @@ -638,13 +613,12 @@ static UINTVAL validate(PARROT_INTERP, ARGIN(const STRING *src)) { ASSERT_ARGS(validate) - INTVAL offset; String_iter iter; const INTVAL length = Parrot_str_length(interp, src); - ENCODING_ITER_INIT(interp, src, &iter); - for (offset = 0; offset < 
length; ++offset) { - const UINTVAL codepoint = iter.get_and_advance(interp, &iter); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < length) { + const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (codepoint >= 0x80) return 0; } diff --git a/src/string/charset/iso-8859-1.c b/src/string/charset/iso-8859-1.c index 8e965fa..b795e0d 100644 --- a/src/string/charset/iso-8859-1.c +++ b/src/string/charset/iso-8859-1.c @@ -178,24 +178,24 @@ static STRING * to_iso_8859_1(PARROT_INTERP, ARGIN(const STRING *src)) { ASSERT_ARGS(to_iso_8859_1) - UINTVAL offs, src_len; + UINTVAL src_len; String_iter iter; /* iso-8859-1 is never bigger then source */ STRING * dest = Parrot_str_clone(interp, src); - ENCODING_ITER_INIT(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); src_len = src->strlen; dest->bufused = src_len; - dest->charset = Parrot_iso_8859_1_charset_ptr; - dest->encoding = Parrot_fixed_8_encoding_ptr; - for (offs = 0; offs < src_len; ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + while (iter.charpos < src_len) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (c >= 0x100) Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION, "lossy conversion to iso-8559-1"); - ENCODING_SET_BYTE(interp, dest, offs, c); + Parrot_fixed_8_encoding_ptr->set_byte(interp, dest, iter.charpos - 1, c); } + dest->charset = Parrot_iso_8859_1_charset_ptr; + dest->encoding = Parrot_fixed_8_encoding_ptr; return dest; } @@ -221,18 +221,18 @@ to_unicode(PARROT_INTERP, ARGIN(const STRING *src)) dest->charset = Parrot_unicode_charset_ptr; dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, dest); Parrot_gc_reallocate_string_storage(interp, dest, src->strlen); - ENCODING_ITER_INIT(interp, dest, &iter); - for (offs = 0; offs < src->strlen; ++offs) { - const UINTVAL c = ENCODING_GET_BYTE(interp, src, offs); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < src->strlen) { + const UINTVAL c = 
ENCODING_GET_BYTE(interp, src, iter.charpos); if (iter.bytepos >= Buffer_buflen(dest) - 4) { - UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5); + UINTVAL need = (UINTVAL)((src->strlen - iter.charpos) * 1.5); if (need < 16) need = 16; Parrot_gc_reallocate_string_storage(interp, dest, Buffer_buflen(dest) + need); } - iter.set_and_advance(interp, &iter, c); + STRING_ITER_SET_AND_ADVANCE(interp, dest, &iter, c); } dest->bufused = iter.bytepos; dest->strlen = iter.charpos; diff --git a/src/string/charset/unicode.c b/src/string/charset/unicode.c index e1de74c..03555e2 100644 --- a/src/string/charset/unicode.c +++ b/src/string/charset/unicode.c @@ -651,20 +651,20 @@ compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs)) { ASSERT_ARGS(compare) String_iter l_iter, r_iter; - UINTVAL offs, cl, cr, min_len, l_len, r_len; + UINTVAL min_len, l_len, r_len; /* TODO make optimized equal - strings are equal length then already */ - ENCODING_ITER_INIT(interp, lhs, &l_iter); - ENCODING_ITER_INIT(interp, rhs, &r_iter); + STRING_ITER_INIT(interp, &l_iter); + STRING_ITER_INIT(interp, &r_iter); l_len = lhs->strlen; r_len = rhs->strlen; min_len = l_len > r_len ? r_len : l_len; - for (offs = 0; offs < min_len; ++offs) { - cl = l_iter.get_and_advance(interp, &l_iter); - cr = r_iter.get_and_advance(interp, &r_iter); + while (l_iter.charpos < min_len) { + const UINTVAL cl = STRING_ITER_GET_AND_ADVANCE(interp, lhs, &l_iter); + const UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, rhs, &r_iter); if (cl != cr) return cl < cr ? 
-1 : 1; @@ -716,13 +716,12 @@ static UINTVAL validate(PARROT_INTERP, ARGIN(const STRING *src)) { ASSERT_ARGS(validate) - INTVAL offset; String_iter iter; const INTVAL length = Parrot_str_length(interp, src); - ENCODING_ITER_INIT(interp, src, &iter); - for (offset = 0; offset < length; ++offset) { - const UINTVAL codepoint = iter.get_and_advance(interp, &iter); + STRING_ITER_INIT(interp, &iter); + while (iter.charpos < length) { + const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); /* Check for Unicode non-characters */ if (codepoint >= 0xfdd0 && (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe) @@ -877,24 +876,22 @@ find_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), UINTVAL offse ASSERT_ARGS(find_cclass) String_iter iter; UINTVAL codepoint; - UINTVAL pos = offset; UINTVAL end = offset + count; - ENCODING_ITER_INIT(interp, src, &iter); - - iter.set_position(interp, &iter, pos); + STRING_ITER_INIT(interp, &iter); + STRING_ITER_SET_POSITION(interp, src, &iter, offset); end = src->strlen < end ? src->strlen : end; - for (; pos < end; ++pos) { - codepoint = iter.get_and_advance(interp, &iter); + while (iter.charpos < end) { + codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (codepoint >= 256) { if (u_iscclass(interp, codepoint, flags)) - return pos; + return iter.charpos - 1; } else { if (Parrot_iso_8859_1_typetable[codepoint] & flags) - return pos; + return iter.charpos - 1; } } @@ -920,37 +917,36 @@ find_not_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src), ASSERT_ARGS(find_not_cclass) String_iter iter; UINTVAL codepoint; - UINTVAL pos = offset; UINTVAL end = offset + count; int bit; - if (pos > src->strlen) { + if (offset > src->strlen) { /* XXX: Throw in this case? 
*/ return offset + count; } - ENCODING_ITER_INIT(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); - if (pos) - iter.set_position(interp, &iter, pos); + if (offset) + STRING_ITER_SET_POSITION(interp, src, &iter, offset); end = src->strlen < end ? src->strlen : end; if (flags == enum_cclass_any) return end; - for (; pos < end; ++pos) { - codepoint = iter.get_and_advance(interp, &iter); + while (iter.charpos < end) { + codepoint = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); if (codepoint >= 256) { for (bit = enum_cclass_uppercase; bit <= enum_cclass_word ; bit <<= 1) { if ((bit & flags) && !u_iscclass(interp, codepoint, bit)) - return pos; + return iter.charpos - 1; } } else { if (!(Parrot_iso_8859_1_typetable[codepoint] & flags)) - return pos; + return iter.charpos - 1; } } @@ -978,8 +974,8 @@ string_from_codepoint(PARROT_INTERP, UINTVAL codepoint) dest->strlen = 1; - ENCODING_ITER_INIT(interp, dest, &iter); - iter.set_and_advance(interp, &iter, codepoint); + STRING_ITER_INIT(interp, &iter); + STRING_ITER_SET_AND_ADVANCE(interp, dest, &iter, codepoint); dest->bufused = iter.bytepos; return dest; @@ -1002,13 +998,12 @@ compute_hash(PARROT_INTERP, ARGIN(const STRING *src), size_t seed) { ASSERT_ARGS(compute_hash) String_iter iter; - UINTVAL offs; size_t hashval = seed; - ENCODING_ITER_INIT(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); - for (offs = 0; offs < src->strlen; ++offs) { - const UINTVAL c = iter.get_and_advance(interp, &iter); + while (iter.charpos < src->strlen) { + const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, src, &iter); hashval += hashval << 5; hashval += c; } diff --git a/src/string/encoding/fixed_8.c b/src/string/encoding/fixed_8.c index 52008a5..ec51147 100644 --- a/src/string/encoding/fixed_8.c +++ b/src/string/encoding/fixed_8.c @@ -46,6 +46,48 @@ static UINTVAL fixed8_get_next(PARROT_INTERP, ARGMOD(String_iter *iter)) __attribute__nonnull__(2) FUNC_MODIFIES(*iter); +static UINTVAL fixed8_iter_get(PARROT_INTERP, + 
ARGIN(const STRING *str), + ARGIN(const String_iter *iter), + INTVAL offset) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +static UINTVAL fixed8_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *iter)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*iter); + +static void fixed8_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), + ARGMOD(String_iter *iter), + UINTVAL c) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str) + FUNC_MODIFIES(*iter); + +static void fixed8_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *iter), + UINTVAL pos) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*iter); + +static void fixed8_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *iter), + INTVAL skip) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*iter); + static void fixed8_set_next(PARROT_INTERP, ARGMOD(String_iter *iter), UINTVAL c) @@ -125,6 +167,24 @@ static STRING * to_encoding(PARROT_INTERP, SHIM(const STRING *src)) #define ASSERT_ARGS_fixed8_get_next __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = 
(\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) +#define ASSERT_ARGS_fixed8_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(iter)) #define ASSERT_ARGS_fixed8_set_next __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(iter)) @@ -375,6 +435,108 @@ bytes(SHIM_INTERP, ARGIN(const STRING *src)) /* +=item C + +Get the character at C plus C. + +=cut + +*/ + +static UINTVAL +fixed8_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), ARGIN(const String_iter *iter), INTVAL offset) +{ + ASSERT_ARGS(fixed8_iter_get) + return get_byte(interp, str, iter->charpos + offset); +} + +/* + +=item C + +Moves the string iterator C by C characters. + +=cut + +*/ + +static void +fixed8_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *iter), INTVAL skip) +{ + ASSERT_ARGS(fixed8_iter_skip) + iter->bytepos += skip; + iter->charpos += skip; + PARROT_ASSERT(iter->bytepos <= Buffer_buflen(str)); +} + +/* + +=item C + +Moves the string iterator C to the next codepoint. + +=cut + +*/ + +static UINTVAL +fixed8_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *iter)) +{ + ASSERT_ARGS(fixed8_iter_get_and_advance) + const UINTVAL c = get_byte(interp, str, iter->charpos++); + iter->bytepos++; + return c; +} + +/* + +=item C + +With the string iterator C, appends the codepoint C and advances to the +next position in the string. + +=cut + +*/ + +static void +fixed8_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), ARGMOD(String_iter *iter), UINTVAL c) +{ + ASSERT_ARGS(fixed8_iter_set_and_advance) + set_byte(interp, str, iter->charpos++, c); + iter->bytepos++; +} + +/* + +=item C + +Moves the string iterator C to the position C in the string. 
+ +=cut + +*/ + +static void +fixed8_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *iter), UINTVAL pos) +{ + ASSERT_ARGS(fixed8_iter_set_position) + iter->bytepos = iter->charpos = pos; + PARROT_ASSERT(pos <= Buffer_buflen(str)); +} + +/* + =item C Moves the string iterator C to the next codepoint. @@ -511,7 +673,12 @@ Parrot_encoding_fixed_8_init(PARROT_INTERP) bytes, iter_init, find_cclass, - fixed_8_hash + fixed_8_hash, + fixed8_iter_get, + fixed8_iter_skip, + fixed8_iter_get_and_advance, + fixed8_iter_set_and_advance, + fixed8_iter_set_position }; STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding); diff --git a/src/string/encoding/ucs2.c b/src/string/encoding/ucs2.c index 1c9cd48..6e2ec93 100644 --- a/src/string/encoding/ucs2.c +++ b/src/string/encoding/ucs2.c @@ -120,6 +120,50 @@ static size_t ucs2_hash(PARROT_INTERP, __attribute__nonnull__(1) __attribute__nonnull__(2); +static UINTVAL ucs2_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *i), + INTVAL offset) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +static UINTVAL ucs2_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void ucs2_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), + ARGMOD(String_iter *i), + UINTVAL c) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str) + FUNC_MODIFIES(*i); + +static void ucs2_iter_set_position(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + UINTVAL n) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void ucs2_iter_skip(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + INTVAL skip) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + 
__attribute__nonnull__(3) + FUNC_MODIFIES(*i); + static void ucs2_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL n) @@ -161,6 +205,26 @@ static void ucs2_set_position(SHIM_INTERP, #define ASSERT_ARGS_ucs2_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(s)) +#define ASSERT_ARGS_ucs2_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs2_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs2_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs2_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs2_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) #define ASSERT_ARGS_ucs2_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(i)) /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. 
*/ @@ -323,11 +387,11 @@ get_codepoints(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL String_iter iter; UINTVAL start; - iter_init(interp, src, &iter); - iter.set_position(interp, &iter, offset); + STRING_ITER_INIT(interp, &iter); + ucs2_iter_set_position(interp, src, &iter, offset); start = iter.bytepos; return_string->strstart = (char *)return_string->strstart + start; - iter.set_position(interp, &iter, offset + count); + ucs2_iter_set_position(interp, src, &iter, offset + count); return_string->bufused = iter.bytepos - start; } #endif @@ -402,6 +466,149 @@ bytes(SHIM_INTERP, ARGIN(const STRING *src)) /* +=item C + +Get the character at C + C. + +=cut + +*/ + +static UINTVAL +ucs2_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset) +{ + ASSERT_ARGS(ucs2_iter_get) + return get_codepoint(interp, str, i->charpos + offset); +} + +/* + +=item C + +Moves the string iterator C by C characters. + +=cut + +*/ + +static void +ucs2_iter_skip(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip) +{ + ASSERT_ARGS(ucs2_iter_skip) + UNUSED(str); + +#if PARROT_HAS_ICU + i->charpos += skip; + i->bytepos += skip * sizeof (UChar); +#else + UNUSED(i); + UNUSED(skip); + no_ICU_lib(interp); +#endif +} + +/* + +=item C + +Moves the string iterator C to the next UCS-2 codepoint. 
+ +=cut + +*/ + +static UINTVAL +ucs2_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i)) +{ + ASSERT_ARGS(ucs2_iter_get_and_advance) + +#if PARROT_HAS_ICU + UChar * const s = (UChar*) str->strstart; + size_t pos = i->bytepos / sizeof (UChar); + + /* TODO either make sure that we don't go past end or use SAFE + * iter versions + */ + const UChar c = s[pos++]; + i->charpos++; + i->bytepos = pos * sizeof (UChar); + return c; +#else + UNUSED(str); + UNUSED(i); + no_ICU_lib(interp); + return (UINTVAL)0; /* Stop the static analyzers from panicing */ +#endif +} + +/* + +=item C + +With the string iterator C, appends the codepoint C and advances to the +next position in the string. + +=cut + +*/ + +static void +ucs2_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c) +{ + ASSERT_ARGS(ucs2_iter_set_and_advance) + +#if PARROT_HAS_ICU + UChar * const s = (UChar*) str->strstart; + UINTVAL pos = i->bytepos / sizeof (UChar); + s[pos++] = (UChar)c; + i->charpos++; + i->bytepos = pos * sizeof (UChar); +#else + UNUSED(str); + UNUSED(i); + UNUSED(c); + no_ICU_lib(interp); +#endif +} + +/* + +=item C + +Moves the string iterator C to the position C in the string. + +=cut + +*/ + +static void +ucs2_iter_set_position(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n) +{ + ASSERT_ARGS(ucs2_iter_set_position) + UNUSED(str); + +#if PARROT_HAS_ICU + i->charpos = n; + i->bytepos = n * sizeof (UChar); +#else + UNUSED(i); + UNUSED(n); + no_ICU_lib(interp); +#endif +} + +/* + =item C Moves the string iterator C to the next UCS-2 codepoint. 
@@ -592,7 +799,12 @@ Parrot_encoding_ucs2_init(PARROT_INTERP) bytes, iter_init, find_cclass, - ucs2_hash + ucs2_hash, + ucs2_iter_get, + ucs2_iter_skip, + ucs2_iter_get_and_advance, + ucs2_iter_set_and_advance, + ucs2_iter_set_position }; STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding); Parrot_register_encoding(interp, "ucs2", return_encoding); diff --git a/src/string/encoding/ucs4.c b/src/string/encoding/ucs4.c index c608ef8..e4d0409 100644 --- a/src/string/encoding/ucs4.c +++ b/src/string/encoding/ucs4.c @@ -123,6 +123,50 @@ static size_t ucs4_hash(PARROT_INTERP, __attribute__nonnull__(1) __attribute__nonnull__(2); +static UINTVAL ucs4_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *i), + INTVAL offset) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +static UINTVAL ucs4_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void ucs4_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), + ARGMOD(String_iter *i), + UINTVAL c) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str) + FUNC_MODIFIES(*i); + +static void ucs4_iter_set_position(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + UINTVAL n) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void ucs4_iter_skip(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + INTVAL skip) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + static void ucs4_set_position(PARROT_INTERP, ARGMOD(String_iter *i), UINTVAL n) @@ -167,6 +211,26 @@ static void ucs4_set_position(PARROT_INTERP, #define ASSERT_ARGS_ucs4_hash __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) 
\ , PARROT_ASSERT_ARG(s)) +#define ASSERT_ARGS_ucs4_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs4_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs4_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs4_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_ucs4_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) #define ASSERT_ARGS_ucs4_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(i)) @@ -414,6 +478,141 @@ bytes(SHIM_INTERP, ARGIN(const STRING *src)) /* +=item C + +Get the character at C + C. + +=cut + +*/ + +static UINTVAL +ucs4_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset) +{ + ASSERT_ARGS(ucs4_iter_get) + return get_codepoint(interp, str, i->charpos + offset); +} + +/* + +=item C + +Moves the string iterator C by C characters. + +=cut + +*/ + +static void +ucs4_iter_skip(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip) +{ + ASSERT_ARGS(ucs4_iter_skip) + UNUSED(str); + +#if PARROT_HAS_ICU + i->charpos += skip; + i->bytepos += skip * sizeof (UChar32); +#else + UNUSED(i); + UNUSED(skip); + no_ICU_lib(interp); +#endif +} + +/* + +=item C + +Moves the string iterator C to the next codepoint. 
+ +=cut + +*/ + +static UINTVAL +ucs4_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i)) +{ + ASSERT_ARGS(ucs4_iter_get_and_advance) + +#if PARROT_HAS_ICU + const UChar32 * const s = (const UChar32*) str->strstart; + const UChar32 c = s[i->charpos++]; + i->bytepos += sizeof (UChar32); + return c; +#else + UNUSED(str); + UNUSED(i); + no_ICU_lib(interp); + return (UINTVAL)0; /* Stop the static analyzers from panicing */ +#endif +} + +/* + +=item C + +With the string iterator C, appends the codepoint C and advances to the +next position in the string. + +=cut + +*/ + +static void +ucs4_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c) +{ + ASSERT_ARGS(ucs4_iter_set_and_advance) + +#if PARROT_HAS_ICU + UChar32 * const s = (UChar32*) str->strstart; + s[i->charpos++] = (UChar32)c; + i->bytepos += sizeof (UChar32); +#else + UNUSED(str); + UNUSED(i); + UNUSED(c); + no_ICU_lib(interp); +#endif +} + +/* + +=item C + +Moves the string iterator C to the position C in the string. + +=cut + +*/ + +static void +ucs4_iter_set_position(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n) +{ + ASSERT_ARGS(ucs4_iter_set_position) + UNUSED(str); + +#if PARROT_HAS_ICU + i->charpos = n; + i->bytepos = n * sizeof (UChar32); +#else + UNUSED(i); + UNUSED(n); + no_ICU_lib(interp); +#endif +} + +/* + =item C Moves the string iterator C to the next UCS-4 codepoint. 
@@ -580,10 +779,15 @@ Parrot_encoding_ucs4_init(PARROT_INTERP) iter_init, find_cclass, #if PARROT_HAS_ICU - ucs4_hash + ucs4_hash, #else - NULL + NULL, #endif + ucs4_iter_get, + ucs4_iter_skip, + ucs4_iter_get_and_advance, + ucs4_iter_set_and_advance, + ucs4_iter_set_position }; STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding); Parrot_register_encoding(interp, "ucs4", return_encoding); diff --git a/src/string/encoding/utf16.c b/src/string/encoding/utf16.c index 0c5ec2d..2570de5 100644 --- a/src/string/encoding/utf16.c +++ b/src/string/encoding/utf16.c @@ -107,6 +107,51 @@ static void utf16_encode_and_advance(SHIM_INTERP, __attribute__nonnull__(2) FUNC_MODIFIES(*i); +static UINTVAL utf16_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *i), + INTVAL offset) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +PARROT_WARN_UNUSED_RESULT +static UINTVAL utf16_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void utf16_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), + ARGMOD(String_iter *i), + UINTVAL c) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str) + FUNC_MODIFIES(*i); + +static void utf16_iter_set_position(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + UINTVAL n) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void utf16_iter_skip(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + INTVAL skip) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + static void utf16_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL n) @@ -147,6 +192,26 @@ static void utf16_set_position(SHIM_INTERP, PARROT_ASSERT_ARG(i)) 
#define ASSERT_ARGS_utf16_encode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf16_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) #define ASSERT_ARGS_utf16_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(i)) /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. 
*/ @@ -377,19 +442,27 @@ static STRING * get_codepoints(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL count) { ASSERT_ARGS(get_codepoints) - String_iter iter; - UINTVAL start; +#if PARROT_HAS_ICU + UINTVAL pos = 0, start; + const UChar * const s = (UChar*) src->strstart; STRING * const return_string = Parrot_str_copy(interp, src); - iter_init(interp, src, &iter); - iter.set_position(interp, &iter, offset); - start = iter.bytepos; - return_string->strstart = (char *)return_string->strstart + start ; - iter.set_position(interp, &iter, offset + count); - return_string->bufused = iter.bytepos - start; + U16_FWD_N_UNSAFE(s, pos, offset); + start = pos * sizeof (UChar); + return_string->strstart = (char *)return_string->strstart + start; + U16_FWD_N_UNSAFE(s, pos, count); + return_string->bufused = pos * sizeof (UChar) - start; return_string->strlen = count; return_string->hashval = 0; return return_string; +#else + UNUSED(src); + UNUSED(offset); + UNUSED(count); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif } @@ -432,15 +505,24 @@ static UINTVAL codepoints(PARROT_INTERP, ARGIN(const STRING *src)) { ASSERT_ARGS(codepoints) - String_iter iter; +#if PARROT_HAS_ICU + const UChar * const s = (UChar*) src->strstart; + UINTVAL pos = 0, charpos = 0; /* * this is used to initially calculate src->strlen, * therefore we must scan the whole string */ - iter_init(interp, src, &iter); - while (iter.bytepos < src->bufused) - iter.get_and_advance(interp, &iter); - return iter.charpos; + while (pos * sizeof (UChar) < src->bufused) { + U16_FWD_1_UNSAFE(s, pos); + ++charpos; + } + return charpos; +#else + UNUSED(src); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif } /* @@ -461,6 +543,189 @@ bytes(SHIM_INTERP, ARGIN(const STRING *src)) return src->bufused; } +/* + +=item C + +Get the character at C plus C. 
+ +=cut + +*/ + +static UINTVAL +utf16_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset) +{ + ASSERT_ARGS(utf16_iter_get) +#if PARROT_HAS_ICU + const UChar * const s = (UChar*) str->strstart; + UINTVAL c, pos; + + pos = i->bytepos / sizeof (UChar); + if (offset > 0) { + U16_FWD_N_UNSAFE(s, pos, offset); + } + else if (offset < 0) { + U16_BACK_N_UNSAFE(s, pos, -offset); + } + U16_GET_UNSAFE(s, pos, c); + + return c; +#else + UNUSED(str); + UNUSED(i); + UNUSED(offset); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + +/* + +=item C + +Moves the string iterator C by C characters. + +=cut + +*/ + +static void +utf16_iter_skip(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip) +{ + ASSERT_ARGS(utf16_iter_skip) +#if PARROT_HAS_ICU + const UChar * const s = (UChar*) str->strstart; + UINTVAL pos = i->bytepos / sizeof (UChar); + + if (skip > 0) { + U16_FWD_N_UNSAFE(s, pos, skip); + } + else if (skip < 0) { + U16_BACK_N_UNSAFE(s, pos, -skip); + } + + i->charpos += skip; + i->bytepos = pos * sizeof (UChar); +#else + UNUSED(str); + UNUSED(i); + UNUSED(skip); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + +/* + +=item C + +Moves the string iterator C to the next UTF-16 codepoint. 
+ +=cut + +*/ + +PARROT_WARN_UNUSED_RESULT +static UINTVAL +utf16_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i)) +{ + ASSERT_ARGS(utf16_iter_get_and_advance) +#if PARROT_HAS_ICU + const UChar * const s = (UChar*) str->strstart; + UINTVAL c, pos; + pos = i->bytepos / sizeof (UChar); + /* TODO either make sure that we don't go past end or use SAFE + * iter versions + */ + U16_NEXT_UNSAFE(s, pos, c); + i->charpos++; + i->bytepos = pos * sizeof (UChar); + return c; +#else + UNUSED(str); + UNUSED(i); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + +/* + +=item C + +With the string iterator C, appends the codepoint C and advances to the +next position in the string. + +=cut + +*/ + +static void +utf16_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c) +{ + ASSERT_ARGS(utf16_iter_set_and_advance) +#if PARROT_HAS_ICU + UChar * const s = (UChar*) str->strstart; + UINTVAL pos; + pos = i->bytepos / sizeof (UChar); + U16_APPEND_UNSAFE(s, pos, c); + i->charpos++; + i->bytepos = pos * sizeof (UChar); +#else + UNUSED(str); + UNUSED(i); + UNUSED(c); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + +/* + +=item C + +Moves the string iterator C to the position C in the string. 
+ +=cut + +*/ + +static void +utf16_iter_set_position(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL n) +{ + ASSERT_ARGS(utf16_iter_set_position) +#if PARROT_HAS_ICU + UChar * const s = (UChar*) str->strstart; + UINTVAL pos; + pos = 0; + U16_FWD_N_UNSAFE(s, pos, n); /* pos is an offset into s in UChar units; it is scaled to bytes below */ + i->charpos = n; + i->bytepos = pos * sizeof (UChar); +#else + UNUSED(str); + UNUSED(i); + UNUSED(n); + + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR, + "no ICU lib loaded"); +#endif +} + #if PARROT_HAS_ICU /* @@ -595,7 +860,12 @@ Parrot_encoding_utf16_init(PARROT_INTERP) bytes, iter_init, find_cclass, - NULL + NULL, + utf16_iter_get, + utf16_iter_skip, + utf16_iter_get_and_advance, + utf16_iter_set_and_advance, + utf16_iter_set_position }; STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding); Parrot_register_encoding(interp, "utf16", return_encoding); diff --git a/src/string/encoding/utf8.c b/src/string/encoding/utf8.c index ff17761..811ce76 100644 --- a/src/string/encoding/utf8.c +++ b/src/string/encoding/utf8.c @@ -118,6 +118,48 @@ static void utf8_encode_and_advance(PARROT_INTERP, __attribute__nonnull__(2) FUNC_MODIFIES(*i); +static UINTVAL utf8_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), + ARGIN(const String_iter *i), + INTVAL offset) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3); + +static UINTVAL utf8_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i)) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void utf8_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), + ARGMOD(String_iter *i), + UINTVAL c) + __attribute__nonnull__(1) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*str) + FUNC_MODIFIES(*i); + +static void utf8_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + UINTVAL pos) + __attribute__nonnull__(2) +
__attribute__nonnull__(3) + FUNC_MODIFIES(*i); + +static void utf8_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), + ARGMOD(String_iter *i), + INTVAL skip) + __attribute__nonnull__(2) + __attribute__nonnull__(3) + FUNC_MODIFIES(*i); + static void utf8_set_position(SHIM_INTERP, ARGMOD(String_iter *i), UINTVAL pos) @@ -175,6 +217,24 @@ static const void * utf8_skip_forward(ARGIN(const void *ptr), UINTVAL n) #define ASSERT_ARGS_utf8_encode_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(interp) \ , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_set_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(interp) \ + , PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) +#define ASSERT_ARGS_utf8_iter_skip __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ + PARROT_ASSERT_ARG(str) \ + , PARROT_ASSERT_ARG(i)) #define ASSERT_ARGS_utf8_set_position __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ PARROT_ASSERT_ARG(i)) #define ASSERT_ARGS_utf8_skip_backward __attribute__unused__ int _ASSERT_ARGS_CHECK = (\ @@ -387,6 +447,194 @@ utf8_skip_backward(ARGIN(const void *ptr), UINTVAL n) /* +=item C<static UINTVAL utf8_iter_get(PARROT_INTERP, const STRING *str, const String_iter *i, INTVAL offset)> + +Get the character at C<i> plus C<offset>.
+ +=cut + +*/ + +static UINTVAL +utf8_iter_get(PARROT_INTERP, + ARGIN(const STRING *str), ARGIN(const String_iter *i), INTVAL offset) +{ + ASSERT_ARGS(utf8_iter_get) + const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos); + + if (offset > 0) { + u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, offset); + } + else if (offset < 0) { + u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr, -offset); + } + + return utf8_decode(interp, u8ptr); /* i is const here, so the iterator position is left unchanged */ +} + +/* + +=item C<static void utf8_iter_skip(PARROT_INTERP, const STRING *str, String_iter *i, INTVAL skip)> + +Moves the string iterator C<i> by C<skip> characters. + +=cut + +*/ + +static void +utf8_iter_skip(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), INTVAL skip) +{ + ASSERT_ARGS(utf8_iter_skip) + const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos); + + if (skip > 0) { + u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, skip); + } + else if (skip < 0) { + u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr, -skip); + } + + i->charpos += skip; /* skip is signed, so this also covers moving backward */ + i->bytepos = (const char *)u8ptr - (const char *)str->strstart; +} + +/* + +=item C<static UINTVAL utf8_iter_get_and_advance(PARROT_INTERP, const STRING *str, String_iter *i)> + +The UTF-8 implementation of the string iterator's C<get_and_advance> +function.
+ +=cut + +*/ + +static UINTVAL +utf8_iter_get_and_advance(PARROT_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i)) +{ + ASSERT_ARGS(utf8_iter_get_and_advance) + const utf8_t *u8ptr = (utf8_t *)((char *)str->strstart + i->bytepos); + UINTVAL c = *u8ptr; + + if (UTF8_IS_START(c)) { + UINTVAL len = UTF8SKIP(u8ptr); + + c &= UTF8_START_MASK(len); + i->bytepos += len; /* bytepos moves past the whole sequence before its continuation bytes are checked */ + for (len--; len; len--) { /* one pass per byte that follows the start byte */ + u8ptr++; + + if (!UTF8_IS_CONTINUATION(*u8ptr)) + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, + "Malformed UTF-8 string\n"); + c = UTF8_ACCUMULATE(c, *u8ptr); + } + + if (UNICODE_IS_SURROGATE(c)) + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, + "Surrogate in UTF-8 string\n"); + } + else if (!UNICODE_IS_INVARIANT(c)) { + Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, + "Malformed UTF-8 string\n"); + } + else { + i->bytepos++; /* c passed UNICODE_IS_INVARIANT: consume exactly one byte */ + } + + i->charpos++; + return c; +} + +/* + +=item C<static void utf8_iter_set_and_advance(PARROT_INTERP, STRING *str, String_iter *i, UINTVAL c)> + +The UTF-8 implementation of the string iterator's C<set_and_advance> +function. + +=cut + +*/ + +static void +utf8_iter_set_and_advance(PARROT_INTERP, + ARGMOD(STRING *str), ARGMOD(String_iter *i), UINTVAL c) +{ + ASSERT_ARGS(utf8_iter_set_and_advance) + unsigned char * const pos = (unsigned char *)str->strstart + i->bytepos; + unsigned char * const new_pos = (unsigned char *)utf8_encode(interp, pos, c); + + i->bytepos += (new_pos - pos); + /* XXX possible buffer overrun exception? utf8_encode() has already run by the time this assert is reached */ + PARROT_ASSERT(i->bytepos <= Buffer_buflen(str)); + i->charpos++; +} + +/* + +=item C<static void utf8_iter_set_position(PARROT_INTERP, const STRING *str, String_iter *i, UINTVAL pos)> + +The UTF-8 implementation of the string iterator's C<set_position> +function.
+ +=cut + +*/ + +static void +utf8_iter_set_position(SHIM_INTERP, + ARGIN(const STRING *str), ARGMOD(String_iter *i), UINTVAL pos) +{ + ASSERT_ARGS(utf8_iter_set_position) + const utf8_t *u8ptr = (const utf8_t *)str->strstart; + + if (pos == 0) { + i->charpos = 0; + i->bytepos = 0; + return; + } + + /* + * We know the byte offsets of three positions: start, current and end. + * Walk from whichever of them needs the fewest character steps to reach pos. + */ + if (pos < i->charpos) { + if (pos <= (i->charpos >> 1)) { + /* go forward from start */ + u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, pos); + } + else { + /* go backward from current */ + u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr + i->bytepos, i->charpos - pos); + } + } + else { + const UINTVAL len = str->strlen; + if (pos <= i->charpos + ((len - i->charpos) >> 1)) { + /* go forward from current */ + u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr + i->bytepos, pos - i->charpos); + } + else { + /* go backward from end */ + u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr + str->bufused, len - pos); + } + } + + i->charpos = pos; + i->bytepos = (const char *)u8ptr - (const char *)str->strstart; +} + +/* + =item C The UTF-8 implementation of the string iterator's C @@ -513,8 +761,8 @@ to_encoding(PARROT_INTERP, ARGIN(const STRING *src)) { ASSERT_ARGS(to_encoding) STRING *result; - String_iter src_iter; - UINTVAL offs, dest_len, dest_pos, src_len; + const ENCODING *src_encoding; + UINTVAL dest_len, dest_pos, src_len; unsigned char *p; if (src->encoding == Parrot_utf8_encoding_ptr) @@ -523,8 +771,8 @@ to_encoding(PARROT_INTERP, ARGIN(const STRING *src)) result = Parrot_gc_new_string_header(interp, 0); src_len = src->strlen; - /* init iter before possilby changing encoding */ - ENCODING_ITER_INIT(interp, src, &src_iter); + /* save source encoding before possibly changing it */ + src_encoding = src->encoding; result->charset = Parrot_unicode_charset_ptr; result->encoding = Parrot_utf8_encoding_ptr; result->strlen = src_len; @@ -542,15 +790,17
@@ to_encoding(PARROT_INTERP, ARGIN(const STRING *src)) result->bufused = dest_len; } else { + String_iter src_iter; + STRING_ITER_INIT(interp, &src_iter); /* zeroes charpos and bytepos */ dest_len = src_len; dest_pos = 0; - for (offs = 0; offs < src_len; ++offs) { - const UINTVAL c = src_iter.get_and_advance(interp, &src_iter); + while (src_iter.charpos < src_len) { + const UINTVAL c = src_encoding->iter_get_and_advance(interp, src, &src_iter); unsigned char *new_pos; unsigned char *pos; if (dest_len - dest_pos < 6) { - UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5); + UINTVAL need = (UINTVAL)((src->strlen - src_iter.charpos + 1) * 1.5); /* assuming charpos is now one past c, the + 1 keeps this equal to the old (strlen - offs) */ if (need < 16) need = 16; dest_len += need; @@ -683,16 +933,16 @@ get_codepoints(PARROT_INTERP, ARGIN(const STRING *src), UINTVAL offset, UINTVAL String_iter iter; UINTVAL start; - iter_init(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); if (offset) - iter.set_position(interp, &iter, offset); + utf8_iter_set_position(interp, src, &iter, offset); start = iter.bytepos; return_string->strstart = (char *)return_string->strstart + start; if (count) - iter.set_position(interp, &iter, offset + count); + utf8_iter_set_position(interp, src, &iter, offset + count); return_string->bufused = iter.bytepos - start; return_string->strlen = count; @@ -749,9 +999,9 @@ codepoints(PARROT_INTERP, ARGIN(const STRING *src)) * this is used to initially calculate src->strlen, * therefore we must scan the whole string */ - iter_init(interp, src, &iter); + STRING_ITER_INIT(interp, &iter); while (iter.bytepos < src->bufused) - iter.get_and_advance(interp, &iter); + utf8_iter_get_and_advance(interp, src, &iter); return iter.charpos; } @@ -825,7 +1075,12 @@ Parrot_encoding_utf8_init(PARROT_INTERP) bytes, iter_init, find_cclass, - NULL + NULL, + utf8_iter_get, + utf8_iter_skip, + utf8_iter_get_and_advance, + utf8_iter_set_and_advance, + utf8_iter_set_position }; STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding); Parrot_register_encoding(interp, "utf8", return_encoding);