php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
normalizer_normalize.c
Go to the documentation of this file.
1/*
2 +----------------------------------------------------------------------+
3 | This source file is subject to version 3.01 of the PHP license, |
4 | that is bundled with this package in the file LICENSE, and is |
5 | available through the world-wide-web at the following url: |
6 | https://www.php.net/license/3_01.txt |
7 | If you did not receive a copy of the PHP license and are unable to |
8 | obtain it through the world-wide-web, please send a note to |
9 | license@php.net so we can mail you a copy immediately. |
10 +----------------------------------------------------------------------+
11 | Authors: Ed Batutis <ed@batutis.com> |
12 +----------------------------------------------------------------------+
13 */
14
15#ifdef HAVE_CONFIG_H
16#include <config.h>
17#endif
18
19#include "php_intl.h"
20#if U_ICU_VERSION_MAJOR_NUM < 56
21#include "unicode/unorm.h"
22#else
23#include <unicode/unorm2.h>
24#endif
25#include "normalizer.h"
26#include "normalizer_class.h"
27#include "intl_convert.h"
28#include <unicode/utf8.h>
29
30
31#if U_ICU_VERSION_MAJOR_NUM >= 56
32static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err)
33{/*{{{*/
34 switch (form)
35 {
37 return unorm2_getNFCInstance(err);
38 break;
40 return unorm2_getNFDInstance(err);
41 break;
43 return unorm2_getNFKCInstance(err);
44 break;
46 return unorm2_getNFKDInstance(err);
47 break;
48 case NORMALIZER_FORM_KC_CF:
49 return unorm2_getNFKCCasefoldInstance(err);
50 break;
51 }
52
54 return NULL;
55}/*}}}*/
56
57static int32_t intl_normalize(zend_long form, const UChar *src, int32_t src_len, UChar *dst, int32_t dst_len, UErrorCode *err)
58{/*{{{*/
59 const UNormalizer2 *norm = intl_get_normalizer(form, err);
60 if (U_FAILURE(*err)) {
61 return -1;
62 }
63
64 return unorm2_normalize(norm, src, src_len, dst, dst_len, err);
65}/*}}}*/
66
67static UBool intl_is_normalized(zend_long form, const UChar *uinput, int32_t uinput_len, UErrorCode *err)
68{/*{{{*/
69 const UNormalizer2 *norm = intl_get_normalizer(form, err);
70
71 if(U_FAILURE(*err)) {
72 return false;
73 }
74
75 return unorm2_isNormalized(norm, uinput, uinput_len, err);
76}/*}}}*/
77#endif
78
79/* {{{ Normalize a string. */
81{
82 char* input = NULL;
83 /* form is optional, defaults to FORM_C */
85 size_t input_len = 0;
86
87 UChar* uinput = NULL;
88 int32_t uinput_len = 0;
89 int expansion_factor = 1;
90 UErrorCode status = U_ZERO_ERROR;
91
92 UChar* uret_buf = NULL;
93 int32_t uret_len = 0;
94
95 zend_string* u8str;
96
97 int32_t size_needed;
98
100
101 /* Parse parameters. */
103 &input, &input_len, &form ) == FAILURE )
104 {
106 }
107
108 expansion_factor = 1;
109
110 switch(form) {
112 expansion_factor = 3;
113 break;
115 expansion_factor = 3;
116 break;
119#if U_ICU_VERSION_MAJOR_NUM >= 56
120 case NORMALIZER_FORM_KC_CF:
121#endif
122 break;
123 default:
124 zend_argument_value_error(2, "must be a a valid normalization form");
126 }
127
128 /*
129 * Normalize string (converting it to UTF-16 first).
130 */
131
132 /* First convert the string to UTF-16. */
133 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
134
135 if( U_FAILURE( status ) )
136 {
137 /* Set global error code. */
139
140 /* Set error messages. */
141 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
142 if (uinput) {
143 efree( uinput );
144 }
146 }
147
148
149 /* Allocate memory for the destination buffer for normalization */
150 uret_len = uinput_len * expansion_factor;
151 uret_buf = eumalloc( uret_len + 1 );
152
153 /* normalize */
154#if U_ICU_VERSION_MAJOR_NUM < 56
155 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
156#else
157 size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
158#endif
159
160 /* Bail out if an unexpected error occurred.
161 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
162 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
163 */
165 intl_error_set_custom_msg( NULL, "Error normalizing string", 0 );
166 efree( uret_buf );
167 efree( uinput );
169 }
170
171 if ( size_needed > uret_len ) {
172 /* realloc does not seem to work properly - memory is corrupted
173 * uret_buf = eurealloc(uret_buf, size_needed + 1);
174 */
175 efree( uret_buf );
176 uret_buf = eumalloc( size_needed + 1 );
177 uret_len = size_needed;
178
180
181 /* try normalize again */
182#if U_ICU_VERSION_MAJOR_NUM < 56
183 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
184#else
185 size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
186#endif
187
188 /* Bail out if an unexpected error occurred. */
189 if( U_FAILURE(status) ) {
190 /* Set error messages. */
191 intl_error_set_custom_msg( NULL,"Error normalizing string", 0 );
192 efree( uret_buf );
193 efree( uinput );
195 }
196 }
197
198 efree( uinput );
199
200 /* the buffer we actually used */
201 uret_len = size_needed;
202
203 /* Convert normalized string from UTF-16 to UTF-8. */
204 u8str = intl_convert_utf16_to_utf8(uret_buf, uret_len, &status );
205 efree( uret_buf );
206 if( !u8str )
207 {
209 "normalizer_normalize: error converting normalized text UTF-8", 0 );
211 }
212
213 /* Return it. */
214 RETVAL_NEW_STR( u8str );
215}
216/* }}} */
217
218/* {{{ Test if a string is in a given normalization form. */
220{
221 char* input = NULL;
222 /* form is optional, defaults to FORM_C */
224 size_t input_len = 0;
225
226 UChar* uinput = NULL;
227 int uinput_len = 0;
228 UErrorCode status = U_ZERO_ERROR;
229
230 UBool uret = false;
231
233
234 /* Parse parameters. */
236 &input, &input_len, &form) == FAILURE )
237 {
239 }
240
241 switch(form) {
246#if U_ICU_VERSION_MAJOR_NUM >= 56
247 case NORMALIZER_FORM_KC_CF:
248#endif
249 break;
250 default:
251 zend_argument_value_error(2, "must be a a valid normalization form");
253 }
254
255
256 /*
257 * Test normalization of string (converting it to UTF-16 first).
258 */
259
260 /* First convert the string to UTF-16. */
261 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
262
263 if( U_FAILURE( status ) )
264 {
265 /* Set global error code. */
267
268 /* Set error messages. */
269 intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 );
270 if (uinput) {
271 efree( uinput );
272 }
274 }
275
276
277 /* test string */
278#if U_ICU_VERSION_MAJOR_NUM < 56
279 uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
280#else
281 uret = intl_is_normalized(form, uinput, uinput_len, &status);
282#endif
283
284 efree( uinput );
285
286 /* Bail out if an unexpected error occurred. */
287 if( U_FAILURE(status) ) {
288 /* Set error messages. */
289 intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 );
291 }
292
293 if ( uret )
295
297}
298/* }}} */
299
300/* {{{ Returns the Decomposition_Mapping property for the given UTF-8 encoded code point. */
301#if U_ICU_VERSION_MAJOR_NUM >= 56
303{
304 char* input = NULL;
305 size_t input_length = 0;
306
307 UChar32 codepoint = -1;
308 int32_t offset = 0;
309
310 UErrorCode status = U_ZERO_ERROR;
311 const UNormalizer2 *norm;
312 UChar decomposition[32];
313 int32_t decomposition_length;
314
316
318
320 Z_PARAM_STRING(input, input_length)
322 Z_PARAM_LONG(form)
324
325 norm = intl_get_normalizer(form, &status);
326
327 U8_NEXT(input, offset, input_length, codepoint);
328 if ((size_t)offset != input_length) {
330 intl_error_set_custom_msg(NULL, "Input string must be exactly one UTF-8 encoded code point long.", 0);
331 return;
332 }
333
334 if ((codepoint < UCHAR_MIN_VALUE) || (codepoint > UCHAR_MAX_VALUE)) {
336 intl_error_set_custom_msg(NULL, "Code point out of range", 0);
337 return;
338 }
339
340 decomposition_length = unorm2_getRawDecomposition(norm, codepoint, decomposition, 32, &status);
341 if (decomposition_length == -1) {
342 RETURN_NULL();
343 }
344
345 RETVAL_NEW_STR(intl_convert_utf16_to_utf8(decomposition, decomposition_length, &status));
346}
347#endif
348/* }}} */
const U_BUFFER_OVERFLOW_ERROR
const U_STRING_NOT_TERMINATED_WARNING
const U_ILLEGAL_ARGUMENT_ERROR
const U_ZERO_ERROR
DNS_STATUS status
Definition dns_win32.c:49
char * err
Definition ffi.c:3029
zend_long offset
#define NULL
Definition gdcache.h:45
foreach($dp as $el) foreach( $dp as $el) if( $pass2< 2) echo ""
#define eumalloc(size)
Definition intl_common.h:31
void intl_convert_utf8_to_utf16(UChar **target, int32_t *target_len, const char *src, size_t src_len, UErrorCode *status)
zend_string * intl_convert_utf16_to_utf8(const UChar *src, int32_t src_len, UErrorCode *status)
void intl_error_set(intl_error *err, UErrorCode code, const char *msg, int copyMsg)
Definition intl_error.c:161
void intl_error_reset(intl_error *err)
Definition intl_error.c:78
void intl_error_set_code(intl_error *err, UErrorCode err_code)
Definition intl_error.c:141
void intl_error_set_custom_msg(intl_error *err, const char *msg, int copyMsg)
Definition intl_error.c:90
#define NORMALIZER_FORM_C
Definition normalizer.h:27
#define NORMALIZER_FORM_D
Definition normalizer.h:23
#define NORMALIZER_DEFAULT
Definition normalizer.h:31
#define NORMALIZER_FORM_KC
Definition normalizer.h:29
#define NORMALIZER_FORM_KD
Definition normalizer.h:25
#define PHP_FUNCTION
Definition php.h:364
normalizer_get_raw_decomposition(string $string, int $form=Normalizer::FORM_C)
normalizer_normalize(string $string, int $form=Normalizer::FORM_C)
normalizer_is_normalized(string $string, int $form=Normalizer::FORM_C)
ZEND_API zend_result zend_parse_method_parameters(uint32_t num_args, zval *this_ptr, const char *type_spec,...)
Definition zend_API.c:1314
ZEND_API ZEND_COLD void zend_argument_value_error(uint32_t arg_num, const char *format,...)
Definition zend_API.c:433
#define ZEND_NUM_ARGS()
Definition zend_API.h:530
#define ZEND_PARSE_PARAMETERS_END()
Definition zend_API.h:1641
#define RETURN_FALSE
Definition zend_API.h:1058
#define RETURN_NULL()
Definition zend_API.h:1036
#define RETVAL_NEW_STR(s)
Definition zend_API.h:1015
#define Z_PARAM_OPTIONAL
Definition zend_API.h:1667
#define Z_PARAM_STRING(dest, dest_len)
Definition zend_API.h:2071
#define ZEND_PARSE_PARAMETERS_START(min_num_args, max_num_args)
Definition zend_API.h:1620
#define Z_PARAM_LONG(dest)
Definition zend_API.h:1896
#define RETURN_THROWS()
Definition zend_API.h:1060
#define getThis()
Definition zend_API.h:526
#define RETURN_TRUE
Definition zend_API.h:1059
#define efree(ptr)
Definition zend_alloc.h:155
int32_t zend_long
Definition zend_long.h:42
struct _zend_string zend_string
@ FAILURE
Definition zend_types.h:61