php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
grapheme_util.c
Go to the documentation of this file.
/*
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | https://www.php.net/license/3_01.txt                                 |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Author: Ed Batutis <ed@batutis.com>                                  |
   +----------------------------------------------------------------------+
 */
14
15/* {{{ includes */
16#ifdef HAVE_CONFIG_H
17#include <config.h>
18#endif
19
20#include <php.h>
21#include "grapheme.h"
22#include "grapheme_util.h"
23#include "intl_common.h"
24
25#include <unicode/utypes.h>
26#include <unicode/ucol.h>
27#include <unicode/ustring.h>
28#include <unicode/ubrk.h>
29#include <unicode/usearch.h>
30
32
33/* }}} */
34
35/* {{{ grapheme_close_global_iterator - clean up */
36void
38{
39 UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
40
41 if ( NULL != global_break_iterator ) {
42 ubrk_close(global_break_iterator);
43 }
44}
45/* }}} */
46
/* {{{ grapheme_substr_ascii f='from' - starting point, l='length'
 *
 * Byte-wise substring selection: no copy is made, the result points into str.
 *
 *  str, str_len - input buffer and its length in bytes
 *  f            - start position; a negative value counts back from the end
 *                 of the string; out-of-range values are clamped to
 *                 [0, str_len]
 *  l            - length; a negative value stops that many bytes before the
 *                 end of the string; the result is clamped to
 *                 [0, str_len - f]
 *  sub_str      - receives str + f, or NULL when str_len exceeds INT32_MAX
 *  sub_str_len  - receives the selected length (0 when *sub_str is NULL)
 */
void grapheme_substr_ascii(char *str, size_t str_len, int32_t f, int32_t l, char **sub_str, int32_t *sub_str_len)
{
	int32_t total; /* str_len as a signed value, to avoid signed/unsigned problems */

	/* give both outputs a defined value on every path */
	*sub_str = NULL;
	*sub_str_len = 0;

	if (str_len > INT32_MAX) {
		/* We cannot return long strings from ICU functions, so we won't here too */
		return;
	}
	total = (int32_t)str_len;

	/* if "from" position is negative, count start position from the end
	 * of the string
	 */
	if (f < 0) {
		f += total;
		if (f < 0) {
			f = 0;
		}
	} else if (f > total) {
		f = total;
	}

	/* if "length" position is negative, set it to the length
	 * needed to stop that many chars from the end of the string
	 */
	if (l < 0) {
		l += total - f;
		if (l < 0) {
			l = 0;
		}
	} else if (l > total - f) {
		l = total - f;
	}

	*sub_str = str + f;
	*sub_str_len = l;
}
/* }}} */
86
/* Error bail-out helper for grapheme_strpos_utf16(): when `status` is an ICU
 * failure code, record the code and the custom message in the global intl
 * error, set the enclosing function's `ret_pos` to -1 and jump to its
 * `finish` label. Both `ret_pos` and `finish` must exist at the call site.
 * Wrapped in do { } while (0) so the macro acts as a single statement and is
 * safe inside an unbraced if/else.
 */
#define STRPOS_CHECK_STATUS(status, error) \
	do { \
		if ( U_FAILURE( (status) ) ) { \
			intl_error_set_code( NULL, (status) ); \
			intl_error_set_custom_msg( NULL, (error), 0 ); \
			ret_pos = -1; \
			goto finish; \
		} \
	} while (0)
94
95
96/* {{{ grapheme_strpos_utf16 - strrpos using utf16*/
97int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last)
98{
99 UChar *uhaystack = NULL, *uneedle = NULL;
100 int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
101 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
102 UBreakIterator* bi = NULL;
103 UErrorCode status;
104 UStringSearch* src = NULL;
105
106 if(puchar_pos) {
107 *puchar_pos = -1;
108 }
109 /* convert the strings to UTF-16. */
110
112 intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, haystack, haystack_len, &status );
113 STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");
114
116 intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, needle, needle_len, &status );
117 STRPOS_CHECK_STATUS(status, "Error converting needle string to UTF-16");
118
119 /* get a pointer to the haystack taking into account the offset */
121 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
122 STRPOS_CHECK_STATUS(status, "Failed to get iterator");
124 ubrk_setText(bi, uhaystack, uhaystack_len, &status);
125 STRPOS_CHECK_STATUS(status, "Failed to set up iterator");
126
127 if (uneedle_len == 0) {
128 offset_pos = grapheme_get_haystack_offset(bi, offset);
129 if (offset_pos == -1) {
130 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
131 ret_pos = -1;
132 goto finish;
133 }
134 ret_pos = last && offset >= 0 ? uhaystack_len : offset_pos;
135 goto finish;
136 }
137
139 src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
140 STRPOS_CHECK_STATUS(status, "Error creating search object");
141
142 if(f_ignore_case) {
143 UCollator *coll = usearch_getCollator(src);
145 ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status);
146 STRPOS_CHECK_STATUS(status, "Error setting collation strength");
147 usearch_reset(src);
148 }
149
150 if(offset != 0) {
151 offset_pos = grapheme_get_haystack_offset(bi, offset);
152 if (offset_pos == -1) {
153 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
154 ret_pos = -1;
155 goto finish;
156 }
158 usearch_setOffset(src, last ? 0 : offset_pos, &status);
159 STRPOS_CHECK_STATUS(status, "Invalid search offset");
160 }
161
162
163 if(last) {
164 if (offset >= 0) {
165 char_pos = usearch_last(src, &status);
166 if(char_pos < offset_pos) {
167 /* last one is beyond our start offset */
168 char_pos = USEARCH_DONE;
169 }
170 } else {
171 /* searching backwards is broken, so we search forwards, albeit it's less efficient */
172 int32_t prev_pos = USEARCH_DONE;
173 do {
174 char_pos = usearch_next(src, &status);
175 if (char_pos == USEARCH_DONE || char_pos > offset_pos) {
176 char_pos = prev_pos;
177 break;
178 }
179 prev_pos = char_pos;
180 } while(1);
181 }
182 } else {
183 char_pos = usearch_next(src, &status);
184 }
185 STRPOS_CHECK_STATUS(status, "Error looking up string");
186 if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) {
187 ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos);
188 if(puchar_pos) {
189 *puchar_pos = char_pos;
190 }
191 } else {
192 ret_pos = -1;
193 }
194
195finish:
196 if (uhaystack) {
197 efree( uhaystack );
198 }
199 if (uneedle) {
200 efree( uneedle );
201 }
202 if (bi) {
203 ubrk_close (bi);
204 }
205 if (src) {
206 usearch_close (src);
207 }
208
209 return ret_pos;
210}
211
212/* }}} */
213
214/* {{{ grapheme_ascii_check: ASCII check */
215zend_long grapheme_ascii_check(const unsigned char *day, size_t len)
216{
217 int ret_len = len;
218 while ( len-- ) {
219 if ( *day++ > 0x7f || (*day == '\n' && *(day - 1) == '\r') )
220 return -1;
221 }
222
223 return ret_len;
224}
225
226/* }}} */
227
228/* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
229int32_t grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len )
230{
231 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
232 UErrorCode status = U_ZERO_ERROR;
233 int ret_len, pos;
234 UBreakIterator* bi;
235
236 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
237
238 if( U_FAILURE(status) ) {
239 return -1;
240 }
241
242 ubrk_setText(bi, text, text_length, &status);
243
244 pos = 0;
245
246 for ( ret_len = 0; pos != UBRK_DONE; ) {
247
248 pos = ubrk_next(bi);
249
250 if ( pos != UBRK_DONE ) {
251
252 if ( NULL != boundary_array && ret_len < boundary_array_len ) {
253 boundary_array[ret_len] = pos;
254 }
255
256 ret_len++;
257 }
258 }
259
260 ubrk_close(bi);
261
262 return ret_len;
263}
264/* }}} */
265
266/* {{{ grapheme_count_graphemes */
267int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
268{
269 int ret_len = 0;
270 int pos = 0;
271 UErrorCode status = U_ZERO_ERROR;
272
273 ubrk_setText(bi, string, string_len, &status);
274
275 do {
276
277 pos = ubrk_next(bi);
278
279 if ( UBRK_DONE != pos ) {
280 ret_len++;
281 }
282
283 } while ( UBRK_DONE != pos );
284
285 return ret_len;
286}
287/* }}} */
288
289
290/* {{{ grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
291int32_t grapheme_get_haystack_offset(UBreakIterator* bi, int32_t offset)
292{
293 int32_t pos;
294 int32_t (*iter_op)(UBreakIterator* bi);
295 int iter_incr;
296
297 if ( 0 == offset ) {
298 return 0;
299 }
300
301 if ( offset < 0 ) {
302 iter_op = ubrk_previous;
303 ubrk_last(bi); /* one past the end */
304 iter_incr = 1;
305 }
306 else {
307 iter_op = ubrk_next;
308 iter_incr = -1;
309 }
310
311 pos = 0;
312
313 while ( pos != UBRK_DONE && offset != 0 ) {
314
315 pos = iter_op(bi);
316
317 if ( UBRK_DONE != pos ) {
318 offset += iter_incr;
319 }
320 }
321
322 if ( offset != 0 ) {
323 return -1;
324 }
325
326 return pos;
327}
328/* }}} */
329
330/* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c */
331zend_long grapheme_strrpos_ascii(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset)
332{
333 char *p, *e;
334
335 if (offset >= 0) {
336 p = haystack + offset;
337 e = haystack + haystack_len - needle_len;
338 } else {
339 p = haystack;
340 if (needle_len > (size_t)-offset) {
341 e = haystack + haystack_len - needle_len;
342 } else {
343 e = haystack + haystack_len + offset;
344 }
345 }
346
347 if (needle_len == 1) {
348 /* Single character search can shortcut memcmps */
349 while (e >= p) {
350 if (*e == *needle) {
351 return (e - p + (offset > 0 ? offset : 0));
352 }
353 e--;
354 }
355 return -1;
356 }
357
358 while (e >= p) {
359 if (memcmp(e, needle, needle_len) == 0) {
360 return (e - p + (offset > 0 ? offset : 0));
361 }
362 e--;
363 }
364
365 return -1;
366}
367
368/* }}} */
369
370/* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
371UBreakIterator* grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status )
372{
373 UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
374
375 if ( NULL == global_break_iterator ) {
376
377 global_break_iterator = ubrk_open(UBRK_CHARACTER,
378 NULL, /* icu default locale - locale has no effect on this iterator */
379 NULL, /* text not set in global iterator */
380 0, /* text length = 0 */
381 status);
382
383 INTL_G(grapheme_iterator) = global_break_iterator;
384 }
385
386#if U_ICU_VERSION_MAJOR_NUM >= 69
387 return ubrk_clone(global_break_iterator, status);
388#else
389 int32_t buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
390
391 return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
392#endif
393}
394/* }}} */
size_t len
Definition apprentice.c:174
const U_ZERO_ERROR
DNS_STATUS status
Definition dns_win32.c:49
zend_long offset
#define NULL
Definition gdcache.h:45
zend_long grapheme_strrpos_ascii(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset)
int32_t grapheme_get_haystack_offset(UBreakIterator *bi, int32_t offset)
#define STRPOS_CHECK_STATUS(status, error)
int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last)
int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
void grapheme_substr_ascii(char *str, size_t str_len, int32_t f, int32_t l, char **sub_str, int32_t *sub_str_len)
int32_t grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len)
void grapheme_close_global_iterator(void)
UBreakIterator * grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status)
zend_long grapheme_ascii_check(const unsigned char *day, size_t len)
void intl_convert_utf8_to_utf16(UChar **target, int32_t *target_len, const char *src, size_t src_len, UErrorCode *status)
unsigned const char * pos
Definition php_ffi.h:52
unsigned const char * text
Definition php_ffi.h:53
UBreakIterator * grapheme_iterator
Definition php_intl.h:50
#define INTL_G(v)
Definition php_intl.h:62
p
Definition session.c:1105
ZEND_API ZEND_COLD void zend_argument_value_error(uint32_t arg_num, const char *format,...)
Definition zend_API.c:433
#define ZEND_EXTERN_MODULE_GLOBALS(module_name)
Definition zend_API.h:270
#define efree(ptr)
Definition zend_alloc.h:155
int32_t zend_long
Definition zend_long.h:42
int last