php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
php_unicode.c
Go to the documentation of this file.
1/*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Wez Furlong (wez@thebrainroom.com) |
14 +----------------------------------------------------------------------+
15
16 Based on code from ucdata-2.5, which has the following Copyright:
17
18 Copyright 2001 Computing Research Labs, New Mexico State University
19
20 Permission is hereby granted, free of charge, to any person obtaining a
21 copy of this software and associated documentation files (the "Software"),
22 to deal in the Software without restriction, including without limitation
23 the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 and/or sell copies of the Software, and to permit persons to whom the
25 Software is furnished to do so, subject to the following conditions:
26
27 The above copyright notice and this permission notice shall be included in
28 all copies or substantial portions of the Software.
29*/
30
31#include "php.h"
32
33/* include case folding data generated from the official UnicodeData.txt file */
34#include "mbstring.h"
35#include "php_unicode.h"
36#include "unicode_data.h"
37
39
41
42static bool prop_lookup(unsigned long code, unsigned long n)
43{
44 long l = _ucprop_offsets[n];
45 long r = _ucprop_offsets[n + 1] - 1;
46 while (l <= r) {
47 /*
48 * Determine a "mid" point and adjust to make sure the mid point is at
49 * the beginning of a range pair.
50 */
51 long m = (l + r) >> 1;
52 m -= (m & 1);
53 if (code > _ucprop_ranges[m + 1])
54 l = m + 2;
55 else if (code < _ucprop_ranges[m])
56 r = m - 2;
57 else
58 return true;
59 }
60 return false;
61
62}
63
64MBSTRING_API bool php_unicode_is_prop1(unsigned long code, int prop)
65{
66 return prop_lookup(code, prop);
67}
68
69MBSTRING_API bool php_unicode_is_prop(unsigned long code, ...)
70{
71 bool result = false;
72 va_list va;
73 va_start(va, code);
74
75 while (1) {
76 int prop = va_arg(va, int);
77 if (prop < 0) {
78 break;
79 }
80
81 if (prop_lookup(code, prop)) {
82 result = true;
83 break;
84 }
85 }
86
87 va_end(va);
88 return result;
89}
90
/* Hash function used by the case-mapping hash tables.
 *
 * d: seed/displacement value mixed into the key (mph_lookup() passes 0 for
 *    the first-level hash and the first-level table entry for the second).
 * x: key (codepoint) to hash.
 *
 * All arithmetic is unsigned, so the multiplication wraps instead of
 * overflowing. */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	x ^= d;
	x = ((x >> 16) ^ x) * 0x45d9f3bU;
	return x;
}
96
/* Sentinel returned by mph_lookup() when the key is not present */
#define CODE_NOT_FOUND ((unsigned) -1)

/* Look up `code` in a two-level hash table.
 *
 * code:         key to search for.
 * g_table:      first-level table with g_table_size entries. An entry <= 0
 *               holds the negated slot index directly; a positive entry is
 *               used as the seed for a second hash.
 * table:        flattened (key, value) pairs; slot `idx` occupies
 *               table[2*idx] and table[2*idx + 1].
 * table_size:   number of slots, used as the modulus for the second hash.
 *
 * Returns the value stored for `code`, or CODE_NOT_FOUND if the selected slot
 * holds a different key. */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	short g = g_table[mph_hash(0, code) % g_table_size];

	unsigned idx;
	if (g <= 0) {
		idx = -g;
	} else {
		idx = mph_hash(g, code) % table_size;
	}

	/* The slot is only a candidate; confirm that it really belongs to `code` */
	if (table[2*idx] == code) {
		return table[2*idx + 1];
	}
	return CODE_NOT_FOUND;
}
118
/* Look up `code` in the case-mapping hash table selected by `type`, which is
 * pasted into the table names (_uccase_<type>_g, _uccase_<type>_table, ...).
 * This file uses it with upper, lower, title and fold.
 * Evaluates to the mapped value or CODE_NOT_FOUND. */
#define CASE_LOOKUP(code, type) \
	mph_lookup(code, _uccase_##type##_g, _uccase_##type##_g_size, \
			_uccase_##type##_table, _uccase_##type##_table_size)
122
123static unsigned php_unicode_toupper_raw(unsigned code, const mbfl_encoding *enc)
124{
125 /* After the ASCII characters, the first codepoint with an uppercase version
126 * is 0xB5 (MICRO SIGN) */
127 if (code < 0xB5) {
128 /* Fast path for ASCII */
129 if (code >= 0x61 && code <= 0x7A) {
130 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x69)) {
131 return 0x130;
132 }
133 return code - 0x20;
134 }
135 return code;
136 } else {
137 unsigned new_code = CASE_LOOKUP(code, upper);
138 if (new_code != CODE_NOT_FOUND) {
139 return new_code;
140 }
141 return code;
142 }
143}
144
145static unsigned php_unicode_tolower_raw(unsigned code, const mbfl_encoding *enc)
146{
147 /* After the ASCII characters, the first codepoint with a lowercase version
148 * is 0xC0 (LATIN CAPITAL LETTER A WITH GRAVE) */
149 if (code < 0xC0) {
150 /* Fast path for ASCII */
151 if (code >= 0x41 && code <= 0x5A) {
152 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x0049L)) {
153 return 0x0131L;
154 }
155 return code + 0x20;
156 }
157 return code;
158 } else {
159 unsigned new_code = CASE_LOOKUP(code, lower);
160 if (new_code != CODE_NOT_FOUND) {
161 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x130)) {
162 return 0x69;
163 }
164 return new_code;
165 }
166 return code;
167 }
168}
169
170static unsigned php_unicode_totitle_raw(unsigned code, const mbfl_encoding *enc)
171{
172 unsigned new_code = CASE_LOOKUP(code, title);
173 if (new_code != CODE_NOT_FOUND) {
174 return new_code;
175 }
176
177 /* No dedicated title-case variant, use to-upper instead */
178 return php_unicode_toupper_raw(code, enc);
179}
180
181static unsigned php_unicode_tofold_raw(unsigned code, const mbfl_encoding *enc)
182{
183 if (code < 0x80) {
184 /* Fast path for ASCII */
185 if (code >= 0x41 && code <= 0x5A) {
186 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x49)) {
187 return 0x131;
188 }
189 return code + 0x20;
190 }
191 return code;
192 } else {
193 unsigned new_code = CASE_LOOKUP(code, fold);
194 if (new_code != CODE_NOT_FOUND) {
195 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x130)) {
196 return 0x69;
197 }
198 return new_code;
199 }
200 return code;
201 }
202}
203
204static inline unsigned php_unicode_tolower_simple(unsigned code, const mbfl_encoding *enc) {
205 code = php_unicode_tolower_raw(code, enc);
206 if (UNEXPECTED(code > 0xffffff)) {
207 return _uccase_extra_table[code & 0xffffff];
208 }
209 return code;
210}
211static inline unsigned php_unicode_toupper_simple(unsigned code, const mbfl_encoding *enc) {
212 code = php_unicode_toupper_raw(code, enc);
213 if (UNEXPECTED(code > 0xffffff)) {
214 return _uccase_extra_table[code & 0xffffff];
215 }
216 return code;
217}
218static inline unsigned php_unicode_totitle_simple(unsigned code, const mbfl_encoding *enc) {
219 code = php_unicode_totitle_raw(code, enc);
220 if (UNEXPECTED(code > 0xffffff)) {
221 return _uccase_extra_table[code & 0xffffff];
222 }
223 return code;
224}
225static inline unsigned php_unicode_tofold_simple(unsigned code, const mbfl_encoding *enc) {
226 code = php_unicode_tofold_raw(code, enc);
227 if (UNEXPECTED(code > 0xffffff)) {
228 return _uccase_extra_table[code & 0xffffff];
229 }
230 return code;
231}
232
233static uint32_t *emit_special_casing_sequence(uint32_t w, uint32_t *out)
234{
235 unsigned int len = w >> 24;
236 const unsigned int *p = &_uccase_extra_table[w & 0xFFFFFF];
237 while (len--) {
238 *out++ = *++p;
239 }
240 return out;
241}
242
243/* Used when determining whether special casing rules should be applied to Greek letter sigma */
244static bool scan_ahead_for_cased_letter(unsigned char *in, size_t in_len, unsigned int state, const mbfl_encoding *encoding)
245{
246 uint32_t wchar_buf[64];
247
248 while (in_len) {
249 size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 64, &state);
250 ZEND_ASSERT(out_len <= 64);
251 for (unsigned int i = 0; i < out_len; i++) {
252 uint32_t w = wchar_buf[i];
253 if (php_unicode_is_cased(w)) {
254 return true;
255 }
257 return false;
258 }
259 }
260 }
261
262 return false;
263}
264
/* Used when determining whether special casing rules should be applied to Greek letter sigma
 *
 * Walks backwards over the codepoints in [begin, end) and reports whether the
 * nearest codepoint which is not case-ignorable is a cased letter.
 *
 * begin: first codepoint which may be examined.
 * end:   one past the last codepoint to examine; NULL means there is nothing
 *        to scan.
 *
 * Returns false when the range is empty or consists only of case-ignorable
 * codepoints. */
static bool scan_back_for_cased_letter(uint32_t *begin, uint32_t *end)
{
	if (end == NULL) {
		return false;
	}

	/* Compare before decrementing so that no pointer below `begin` is ever formed */
	while (end > begin) {
		uint32_t w = *--end;
		if (php_unicode_is_cased(w)) {
			return true;
		}
		/* Anything that is neither cased nor case-ignorable ends the scan */
		if (!php_unicode_is_case_ignorable(w)) {
			return false;
		}
	}
	return false;
}
281
282MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)
283{
284 /* A Unicode codepoint can expand out to up to 3 codepoints when uppercased, lowercased, or title cased
285 * See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt */
286 uint32_t wchar_buf[64], converted_buf[192];
287 unsigned int state = 0, title_mode = 0;
288 unsigned char *in = (unsigned char*)srcstr;
289 /* In rare cases, we need to scan backwards through the previously converted codepoints to see
290 * if special conversion rules should be used for the Greek letter sigma */
291 uint32_t *converted_end = NULL;
292
294 mb_convert_buf_init(&buf, in_len + 1, illegal_substchar, illegal_mode);
295
296 while (in_len) {
297 size_t out_len = src_encoding->to_wchar(&in, &in_len, wchar_buf, 64, &state);
298 ZEND_ASSERT(out_len <= 64);
299 uint32_t *p = converted_buf;
300
301 /* In all cases, handle invalid characters early, as we assign special meaning to codepoints > 0xFFFFFF */
302 switch (case_mode) {
304 for (size_t i = 0; i < out_len; i++) {
305 uint32_t w = wchar_buf[i];
306 *p++ = (UNEXPECTED(w > 0xFFFFFF)) ? w : php_unicode_toupper_simple(w, src_encoding);
307 }
308 break;
309
311 for (size_t i = 0; i < out_len; i++) {
312 uint32_t w = wchar_buf[i];
313 *p++ = (UNEXPECTED(w > 0xFFFFFF)) ? w : php_unicode_tolower_simple(w, src_encoding);
314 }
315 break;
316
318 for (size_t i = 0; i < out_len; i++) {
319 uint32_t w = wchar_buf[i];
320 *p++ = (UNEXPECTED(w > 0xFFFFFF)) ? w : php_unicode_tofold_simple(w, src_encoding);
321 }
322 break;
323
325 for (size_t i = 0; i < out_len; i++) {
326 uint32_t w = wchar_buf[i];
327 if (UNEXPECTED(w > 0xFFFFFF)) {
328 *p++ = w;
329 continue;
330 }
331 *p++ = title_mode ? php_unicode_tolower_simple(w, src_encoding) : php_unicode_totitle_simple(w, src_encoding);
333 title_mode = php_unicode_is_cased(w);
334 }
335 }
336 break;
337
339 for (size_t i = 0; i < out_len; i++) {
340 uint32_t w = wchar_buf[i];
341 if (UNEXPECTED(w > 0xFFFFFF)) {
342 *p++ = w;
343 continue;
344 }
345 w = php_unicode_toupper_raw(w, src_encoding);
346 if (UNEXPECTED(w > 0xFFFFFF)) {
347 p = emit_special_casing_sequence(w, p);
348 } else {
349 *p++ = w;
350 }
351 }
352 break;
353
355 for (size_t i = 0; i < out_len; i++) {
356 uint32_t w = wchar_buf[i];
357 if (UNEXPECTED(w > 0xFFFFFF)) {
358 *p++ = w;
359 continue;
360 }
361 if (w == 0x3A3) {
362 /* For Greek capital letter sigma, there is a special casing rule;
363 * if it is the last letter in a word, it should be downcased to U+03C2
364 * (GREEK SMALL LETTER FINAL SIGMA)
365 * Specifically, we need to check if this codepoint is preceded by any
366 * number of case-ignorable codepoints, preceded by a cased letter, AND
367 * is NOT followed by any number of case-ignorable codepoints followed
368 * by a cased letter.
369 * Ref: http://www.unicode.org/reports/tr21/tr21-5.html
370 * Ref: https://unicode.org/Public/UNIDATA/SpecialCasing.txt
371 *
372 * While the special casing rules say we should scan backwards through "any number"
373 * of case-ignorable codepoints, that is a great implementation burden
374 * It would basically mean we need to keep all the codepoints in a big buffer
375 * during this conversion operation, but we don't want to do that (to reduce the
376 * amount of temporary scratch memory used)
377 * Hence, we only scan back through the codepoints in wchar_buf, and if we hit the
378 * beginning of the buffer, whatever codepoints have not yet been overwritten in
379 * the latter part of converted_buf */
380 int j = i - 1;
381 while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
382 j--;
383 }
384 if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
385 /* Now scan ahead to look for a cased letter */
386 j = i + 1;
387 while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
388 j++;
389 }
390 /* If we hit the end of wchar_buf, convert more of the input string into
391 * codepoints and continue scanning */
392 if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
393 *p++ = 0x3C2;
394 continue;
395 }
396 }
397 }
398 w = php_unicode_tolower_raw(w, src_encoding);
399 if (UNEXPECTED(w > 0xFFFFFF)) {
400 p = emit_special_casing_sequence(w, p);
401 } else {
402 *p++ = w;
403 }
404 }
405 break;
406
408 for (size_t i = 0; i < out_len; i++) {
409 uint32_t w = wchar_buf[i];
410 if (UNEXPECTED(w > 0xFFFFFF)) {
411 *p++ = w;
412 continue;
413 }
414 w = php_unicode_tofold_raw(w, src_encoding);
415 if (UNEXPECTED(w > 0xFFFFFF)) {
416 p = emit_special_casing_sequence(w, p);
417 } else {
418 *p++ = w;
419 }
420 }
421 break;
422
424 for (size_t i = 0; i < out_len; i++) {
425 uint32_t w = wchar_buf[i];
426 if (UNEXPECTED(w > 0xFFFFFF)) {
427 *p++ = w;
428 continue;
429 }
430 uint32_t w2;
431 if (title_mode) {
432 if (w == 0x3A3) {
433 int j = i - 1;
434 while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
435 j--;
436 }
437 if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
438 j = i + 1;
439 while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
440 j++;
441 }
442 if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
443 *p++ = 0x3C2;
444 goto set_title_mode;
445 }
446 }
447 }
448 w2 = php_unicode_tolower_raw(w, src_encoding);
449 } else {
450 w2 = php_unicode_totitle_raw(w, src_encoding);
451 }
452 if (UNEXPECTED(w2 > 0xFFFFFF)) {
453 p = emit_special_casing_sequence(w2, p);
454 } else {
455 *p++ = w2;
456 }
457set_title_mode:
459 title_mode = php_unicode_is_cased(w);
460 }
461 }
462 break;
463
465 }
466
467 converted_end = p;
468 ZEND_ASSERT(p - converted_buf <= 192);
469 dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
470 }
471
472 return mb_convert_buf_result(&buf, dst_encoding);
473}
size_t len
Definition apprentice.c:174
int begin
Definition eaw_table.h:20
zend_long n
Definition ffi.c:4979
zend_ffi_ctype_name_buf buf
Definition ffi.c:4685
#define NULL
Definition gdcache.h:45
again j
const mbfl_encoding mbfl_encoding_8859_9
#define MBSTRING_API
Definition mbstring.h:39
unsigned const char * end
Definition php_ffi.h:51
xmlCharEncodingHandlerPtr encoding
Definition php_soap.h:170
MBSTRING_API zend_string * php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)
MBSTRING_API bool php_unicode_is_prop1(unsigned long code, int prop)
Definition php_unicode.c:64
#define CODE_NOT_FOUND
Definition php_unicode.c:97
#define CASE_LOOKUP(code, type)
MBSTRING_API bool php_unicode_is_prop(unsigned long code,...)
Definition php_unicode.c:69
#define php_unicode_is_cased(cc)
#define php_unicode_is_case_ignorable(cc)
php_case_mode
Definition php_unicode.h:80
@ PHP_UNICODE_CASE_TITLE
Definition php_unicode.h:83
@ PHP_UNICODE_CASE_FOLD
Definition php_unicode.h:84
@ PHP_UNICODE_CASE_LOWER_SIMPLE
Definition php_unicode.h:86
@ PHP_UNICODE_CASE_FOLD_SIMPLE
Definition php_unicode.h:88
@ PHP_UNICODE_CASE_LOWER
Definition php_unicode.h:82
@ PHP_UNICODE_CASE_UPPER_SIMPLE
Definition php_unicode.h:85
@ PHP_UNICODE_CASE_TITLE_SIMPLE
Definition php_unicode.h:87
@ PHP_UNICODE_CASE_UPPER
Definition php_unicode.h:81
p
Definition session.c:1105
mb_to_wchar_fn to_wchar
mb_from_wchar_fn from_wchar
#define ZEND_EXTERN_MODULE_GLOBALS(module_name)
Definition zend_API.h:270
struct _zend_string zend_string
#define ZEND_ASSERT(c)
#define EMPTY_SWITCH_DEFAULT_CASE()
#define UNEXPECTED(condition)
bool result
out($f, $s)