php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
mbstring.c
Go to the documentation of this file.
1/*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Tsukada Takuya <tsukada@fminn.nagano.nagano.jp> |
14 | Rui Hirokawa <hirokawa@php.net> |
15 | Hironori Sato <satoh@jpnnet.com> |
16 | Shigeru Kanemoto <sgk@happysize.co.jp> |
17 +----------------------------------------------------------------------+
18*/
19
20/* {{{ includes */
21#include <limits.h>
22
23#include "libmbfl/config.h"
24#include "php.h"
25#include "php_ini.h"
26#include "php_variables.h"
27#include "mbstring.h"
30#include "ext/standard/exec.h"
31#include "main/php_output.h"
32#include "ext/standard/info.h"
33#include "ext/pcre/php_pcre.h"
34
48
49#include "php_globals.h"
50#include "rfc1867.h"
51#include "php_content_types.h"
52#include "SAPI.h"
53#include "php_unicode.h"
54#include "TSRM.h"
55
56#include "mb_gpc.h"
57
58#ifdef HAVE_MBREGEX
59# include "php_mbregex.h"
60#endif
61
62#include "zend_smart_str.h"
63#include "zend_multibyte.h"
64#include "mbstring_arginfo.h"
65
66#include "rare_cp_bitvec.h"
67
68#ifdef __SSE2__
69#include <emmintrin.h>
70#endif
71
72#ifdef __SSE3__
73#include <immintrin.h>
74#include <pmmintrin.h>
75#endif
76
77/* }}} */
78
79/* {{{ prototypes */
81
82static PHP_GINIT_FUNCTION(mbstring);
83static PHP_GSHUTDOWN_FUNCTION(mbstring);
84
85static void php_mb_populate_current_detect_order_list(void);
86
87static int php_mb_encoding_translation(void);
88
89static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size);
90
91static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding);
92
93static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc);
94
95static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
96
97static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
98
99static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
100
101static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
102
103/* See mbfilter_cp5022x.c */
104uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
105/* }}} */
106
107/* {{{ php_mb_default_identify_list */
113
114static const enum mbfl_no_encoding php_mb_default_identify_list_ja[] = {
120};
121
122static const enum mbfl_no_encoding php_mb_default_identify_list_cn[] = {
127};
128
129static const enum mbfl_no_encoding php_mb_default_identify_list_tw_hk[] = {
134};
135
136static const enum mbfl_no_encoding php_mb_default_identify_list_kr[] = {
141};
142
143static const enum mbfl_no_encoding php_mb_default_identify_list_ru[] = {
149};
150
151static const enum mbfl_no_encoding php_mb_default_identify_list_hy[] = {
155};
156
157static const enum mbfl_no_encoding php_mb_default_identify_list_tr[] = {
162};
163
164static const enum mbfl_no_encoding php_mb_default_identify_list_ua[] = {
168};
169
170static const enum mbfl_no_encoding php_mb_default_identify_list_neut[] = {
173};
174
175
176static const php_mb_nls_ident_list php_mb_default_identify_list[] = {
177 { mbfl_no_language_japanese, php_mb_default_identify_list_ja, sizeof(php_mb_default_identify_list_ja) / sizeof(php_mb_default_identify_list_ja[0]) },
178 { mbfl_no_language_korean, php_mb_default_identify_list_kr, sizeof(php_mb_default_identify_list_kr) / sizeof(php_mb_default_identify_list_kr[0]) },
179 { mbfl_no_language_traditional_chinese, php_mb_default_identify_list_tw_hk, sizeof(php_mb_default_identify_list_tw_hk) / sizeof(php_mb_default_identify_list_tw_hk[0]) },
180 { mbfl_no_language_simplified_chinese, php_mb_default_identify_list_cn, sizeof(php_mb_default_identify_list_cn) / sizeof(php_mb_default_identify_list_cn[0]) },
181 { mbfl_no_language_russian, php_mb_default_identify_list_ru, sizeof(php_mb_default_identify_list_ru) / sizeof(php_mb_default_identify_list_ru[0]) },
182 { mbfl_no_language_armenian, php_mb_default_identify_list_hy, sizeof(php_mb_default_identify_list_hy) / sizeof(php_mb_default_identify_list_hy[0]) },
183 { mbfl_no_language_turkish, php_mb_default_identify_list_tr, sizeof(php_mb_default_identify_list_tr) / sizeof(php_mb_default_identify_list_tr[0]) },
184 { mbfl_no_language_ukrainian, php_mb_default_identify_list_ua, sizeof(php_mb_default_identify_list_ua) / sizeof(php_mb_default_identify_list_ua[0]) },
185 { mbfl_no_language_neutral, php_mb_default_identify_list_neut, sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]) }
186};
187
188/* }}} */
189
190/* {{{ mbstring_deps[] */
191static const zend_module_dep mbstring_deps[] = {
192 ZEND_MOD_REQUIRED("pcre")
194};
195/* }}} */
196
197/* {{{ zend_module_entry mbstring_module_entry */
200 NULL,
201 mbstring_deps,
202 "mbstring",
203 ext_functions,
204 PHP_MINIT(mbstring),
205 PHP_MSHUTDOWN(mbstring),
206 PHP_RINIT(mbstring),
207 PHP_RSHUTDOWN(mbstring),
208 PHP_MINFO(mbstring),
210 PHP_MODULE_GLOBALS(mbstring),
211 PHP_GINIT(mbstring),
212 PHP_GSHUTDOWN(mbstring),
213 NULL,
215};
216/* }}} */
217
218/* {{{ static sapi_post_entry php_post_entries[] */
219static const sapi_post_entry php_post_entries[] = {
220 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_std_post_handler },
221 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
222 { NULL, 0, NULL, NULL }
223};
224/* }}} */
225
226#ifdef COMPILE_DL_MBSTRING
227#ifdef ZTS
229#endif
230ZEND_GET_MODULE(mbstring)
231#endif
232
233/* {{{ static sapi_post_entry mbstr_post_entries[] */
234static const sapi_post_entry mbstr_post_entries[] = {
235 { DEFAULT_POST_CONTENT_TYPE, sizeof(DEFAULT_POST_CONTENT_TYPE)-1, sapi_read_standard_form_data, php_mb_post_handler },
236 { MULTIPART_CONTENT_TYPE, sizeof(MULTIPART_CONTENT_TYPE)-1, NULL, rfc1867_post_handler },
237 { NULL, 0, NULL, NULL }
238};
239/* }}} */
240
241static const mbfl_encoding *php_mb_get_encoding(zend_string *encoding_name, uint32_t arg_num) {
242 if (encoding_name) {
243 const mbfl_encoding *encoding;
244 zend_string *last_encoding_name = MBSTRG(last_used_encoding_name);
245 if (last_encoding_name && (last_encoding_name == encoding_name
246 || zend_string_equals_ci(encoding_name, last_encoding_name))) {
248 }
249
250 encoding = mbfl_name2encoding(ZSTR_VAL(encoding_name));
251 if (!encoding) {
252 zend_argument_value_error(arg_num, "must be a valid encoding, \"%s\" given", ZSTR_VAL(encoding_name));
253 return NULL;
254 } else if (encoding->no_encoding <= mbfl_no_encoding_qprint) {
256 php_error_docref(NULL, E_DEPRECATED, "Handling Base64 via mbstring is deprecated; use base64_encode/base64_decode instead");
257 } else if (encoding == &mbfl_encoding_qprint) {
258 php_error_docref(NULL, E_DEPRECATED, "Handling QPrint via mbstring is deprecated; use quoted_printable_encode/quoted_printable_decode instead");
259 } else if (encoding == &mbfl_encoding_html_ent) {
260 php_error_docref(NULL, E_DEPRECATED, "Handling HTML entities via mbstring is deprecated; use htmlspecialchars, htmlentities, or mb_encode_numericentity/mb_decode_numericentity instead");
261 } else if (encoding == &mbfl_encoding_uuencode) {
262 php_error_docref(NULL, E_DEPRECATED, "Handling Uuencode via mbstring is deprecated; use convert_uuencode/convert_uudecode instead");
263 }
264 }
265
266 if (last_encoding_name) {
267 zend_string_release(last_encoding_name);
268 }
269 MBSTRG(last_used_encoding_name) = zend_string_copy(encoding_name);
271 return encoding;
272 } else {
274 }
275}
276
277static const mbfl_encoding *php_mb_get_encoding_or_pass(const char *encoding_name, size_t encoding_name_len) {
278 if (strncmp(encoding_name, "pass", encoding_name_len) == 0) {
279 return &mbfl_encoding_pass;
280 }
281
282 return mbfl_name2encoding_ex(encoding_name, encoding_name_len);
283}
284
/* Count the ',' bytes in the half-open range [p, end). */
static size_t count_commas(const char *p, const char *end) {
	size_t total = 0;
	const char *hit = memchr(p, ',', end - p);

	while (hit != NULL) {
		total++;
		/* Resume the search just past the comma that was found. */
		hit++;
		hit = memchr(hit, ',', end - hit);
	}
	return total;
}
293
294/* {{{ static zend_result php_mb_parse_encoding_list()
295 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
296 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
297 */
298static zend_result php_mb_parse_encoding_list(const char *value, size_t value_length,
299 const mbfl_encoding ***return_list, size_t *return_size, bool persistent, uint32_t arg_num)
300{
301 if (value == NULL || value_length == 0) {
302 *return_list = NULL;
303 *return_size = 0;
304 return SUCCESS;
305 } else {
306 bool included_auto;
307 size_t n, size;
308 const char *p1, *endp, *tmpstr;
309 const mbfl_encoding **entry, **list;
310
311 if (value[0]=='"' && value[value_length-1]=='"' && value_length>2) {
312 tmpstr = value + 1;
313 value_length -= 2;
314 } else {
315 tmpstr = value;
316 }
317
318 endp = tmpstr + value_length;
319 size = 1 + count_commas(tmpstr, endp) + MBSTRG(default_detect_order_list_size);
320 list = (const mbfl_encoding **)pecalloc(size, sizeof(mbfl_encoding*), persistent);
321 entry = list;
322 n = 0;
323 included_auto = 0;
324 p1 = tmpstr;
325 while (1) {
326 const char *comma = memchr(p1, ',', endp - p1);
327 const char *p = comma ? comma : endp;
328 /* trim spaces */
329 while (p1 < p && (*p1 == ' ' || *p1 == '\t')) {
330 p1++;
331 }
332 p--;
333 while (p > p1 && (*p == ' ' || *p == '\t')) {
334 p--;
335 }
336 size_t p1_length = p - p1 + 1;
337 /* convert to the encoding number and check encoding */
338 if (strncasecmp(p1, "auto", p1_length) == 0) {
339 if (!included_auto) {
341 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
342 size_t i;
343 included_auto = 1;
344 for (i = 0; i < identify_list_size; i++) {
345 *entry++ = mbfl_no2encoding(*src++);
346 n++;
347 }
348 }
349 } else {
350 const mbfl_encoding *encoding = mbfl_name2encoding_ex(p1, p1_length);
351 if (!encoding) {
352 /* Called from an INI setting modification */
353 if (arg_num == 0) {
354 php_error_docref("ref.mbstring", E_WARNING, "INI setting contains invalid encoding \"%.*s\"", (int) p1_length, p1);
355 } else {
356 zend_argument_value_error(arg_num, "contains invalid encoding \"%.*s\"", (int) p1_length, p1);
357 }
359 return FAILURE;
360 }
361
362 *entry++ = encoding;
363 n++;
364 }
365 if (n >= size || comma == NULL) {
366 break;
367 }
368 p1 = comma + 1;
369 }
370 *return_list = list;
371 *return_size = n;
372 }
373
374 return SUCCESS;
375}
376/* }}} */
377
378/* {{{
379 * Return FAILURE if input contains any illegal encoding, otherwise SUCCESS.
380 * Emits a ValueError in function context and a warning in INI context, in INI context arg_num must be 0.
381 */
382static zend_result php_mb_parse_encoding_array(HashTable *target_hash, const mbfl_encoding ***return_list,
383 size_t *return_size, uint32_t arg_num)
384{
385 /* Allocate enough space to include the default detect order if "auto" is used. */
386 size_t size = zend_hash_num_elements(target_hash) + MBSTRG(default_detect_order_list_size);
387 const mbfl_encoding **list = ecalloc(size, sizeof(mbfl_encoding*));
388 const mbfl_encoding **entry = list;
389 bool included_auto = 0;
390 size_t n = 0;
391 zval *hash_entry;
392 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
393 zend_string *encoding_str = zval_try_get_string(hash_entry);
394 if (UNEXPECTED(!encoding_str)) {
395 efree(ZEND_VOIDP(list));
396 return FAILURE;
397 }
398
399 if (zend_string_equals_literal_ci(encoding_str, "auto")) {
400 if (!included_auto) {
402 const size_t identify_list_size = MBSTRG(default_detect_order_list_size);
403 size_t j;
404
405 included_auto = 1;
406 for (j = 0; j < identify_list_size; j++) {
407 *entry++ = mbfl_no2encoding(*src++);
408 n++;
409 }
410 }
411 } else {
412 const mbfl_encoding *encoding = mbfl_name2encoding(ZSTR_VAL(encoding_str));
413 if (encoding) {
414 *entry++ = encoding;
415 n++;
416 } else {
417 zend_argument_value_error(arg_num, "contains invalid encoding \"%s\"", ZSTR_VAL(encoding_str));
418 zend_string_release(encoding_str);
419 efree(ZEND_VOIDP(list));
420 return FAILURE;
421 }
422 }
423 zend_string_release(encoding_str);
425 *return_list = list;
426 *return_size = n;
427 return SUCCESS;
428}
429/* }}} */
430
431/* {{{ zend_multibyte interface */
432static const zend_encoding* php_mb_zend_encoding_fetcher(const char *encoding_name)
433{
434 return (const zend_encoding*)mbfl_name2encoding(encoding_name);
435}
436
437static const char *php_mb_zend_encoding_name_getter(const zend_encoding *encoding)
438{
439 return ((const mbfl_encoding *)encoding)->name;
440}
441
442static bool php_mb_zend_encoding_lexer_compatibility_checker(const zend_encoding *_encoding)
443{
444 const mbfl_encoding *encoding = (const mbfl_encoding*)_encoding;
445 return !(encoding->flag & MBFL_ENCTYPE_GL_UNSAFE);
446}
447
448static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *arg_string, size_t arg_length, const zend_encoding **list, size_t list_size)
449{
450 if (!list) {
453 }
454 if (list_size == 1 && ((mbfl_encoding*)*list) == &mbfl_encoding_pass) {
455 /* Emulate behavior of previous implementation; it would never return "pass"
456 * from an encoding auto-detection operation */
457 return NULL;
458 }
459 return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding**)list, list_size, false, false);
460}
461
462static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
463{
464 unsigned int num_errors = 0;
465 zend_string *result = mb_fast_convert((unsigned char*)from, from_length, (const mbfl_encoding*)encoding_from, (const mbfl_encoding*)encoding_to, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
466
467 *to_length = ZSTR_LEN(result);
468 *to = emalloc(ZSTR_LEN(result) + 1); /* Include terminating null byte */
469 memcpy(*to, ZSTR_VAL(result), ZSTR_LEN(result) + 1);
470 zend_string_free(result);
471
472 return from_length;
473}
474
475static zend_result php_mb_zend_encoding_list_parser(const char *encoding_list, size_t encoding_list_len, const zend_encoding ***return_list, size_t *return_size, bool persistent)
476{
477 return php_mb_parse_encoding_list(
478 encoding_list, encoding_list_len,
479 (const mbfl_encoding ***)return_list, return_size,
480 persistent, /* arg_num */ 0);
481}
482
483static const zend_encoding *php_mb_zend_internal_encoding_getter(void)
484{
485 return (const zend_encoding *)MBSTRG(internal_encoding);
486}
487
488static zend_result php_mb_zend_internal_encoding_setter(const zend_encoding *encoding)
489{
491 return SUCCESS;
492}
493
494static zend_multibyte_functions php_mb_zend_multibyte_functions = {
495 "mbstring",
496 php_mb_zend_encoding_fetcher,
497 php_mb_zend_encoding_name_getter,
498 php_mb_zend_encoding_lexer_compatibility_checker,
499 php_mb_zend_encoding_detector,
500 php_mb_zend_encoding_converter,
501 php_mb_zend_encoding_list_parser,
502 php_mb_zend_internal_encoding_getter,
503 php_mb_zend_internal_encoding_setter
504};
505/* }}} */
506
507/* {{{ _php_mb_compile_regex */
508static void *_php_mb_compile_regex(const char *pattern)
509{
511 PCRE2_SIZE err_offset;
512 int errnum;
513
515 PCRE2_CASELESS, &errnum, &err_offset, php_pcre_cctx()))) {
516 PCRE2_UCHAR err_str[128];
517 pcre2_get_error_message(errnum, err_str, sizeof(err_str));
518 php_error_docref(NULL, E_WARNING, "%s (offset=%zu): %s", pattern, err_offset, err_str);
519 }
520 return retval;
521}
522/* }}} */
523
524/* {{{ _php_mb_match_regex */
525static int _php_mb_match_regex(void *opaque, const char *str, size_t str_len)
526{
527 int res;
528
529 pcre2_match_data *match_data = php_pcre_create_match_data(0, opaque);
530 if (NULL == match_data) {
531 pcre2_code_free(opaque);
532 php_error_docref(NULL, E_WARNING, "Cannot allocate match data");
533 return FAILURE;
534 }
535 res = pcre2_match(opaque, (PCRE2_SPTR)str, str_len, 0, 0, match_data, php_pcre_mctx()) >= 0;
536 php_pcre_free_match_data(match_data);
537
538 return res;
539}
540/* }}} */
541
542/* {{{ _php_mb_free_regex */
543static void _php_mb_free_regex(void *opaque)
544{
545 pcre2_code_free(opaque);
546}
547/* }}} */
548
549/* {{{ php_mb_nls_get_default_detect_order_list */
550static int php_mb_nls_get_default_detect_order_list(enum mbfl_no_language lang, enum mbfl_no_encoding **plist, size_t *plist_size)
551{
552 size_t i;
553
554 *plist = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
555 *plist_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
556
557 for (i = 0; i < sizeof(php_mb_default_identify_list) / sizeof(php_mb_default_identify_list[0]); i++) {
558 if (php_mb_default_identify_list[i].lang == lang) {
559 *plist = (enum mbfl_no_encoding *)php_mb_default_identify_list[i].list;
560 *plist_size = php_mb_default_identify_list[i].list_size;
561 return 1;
562 }
563 }
564 return 0;
565}
566/* }}} */
567
568static char *php_mb_rfc1867_substring_conf(const zend_encoding *encoding, char *start, size_t len, char quote)
569{
570 char *result = emalloc(len + 2);
571 char *resp = result;
572 size_t i;
573
574 for (i = 0; i < len && start[i] != quote; ++i) {
575 if (start[i] == '\\' && (start[i + 1] == '\\' || (quote && start[i + 1] == quote))) {
576 *resp++ = start[++i];
577 } else {
578 size_t j = php_mb_mbchar_bytes(start+i, (const mbfl_encoding *)encoding);
579
580 while (j-- > 0 && i < len) {
581 *resp++ = start[i++];
582 }
583 --i;
584 }
585 }
586
587 *resp = '\0';
588 return result;
589}
590
591static char *php_mb_rfc1867_getword(const zend_encoding *encoding, char **line, char stop) /* {{{ */
592{
593 char *pos = *line, quote;
594 char *res;
595
596 while (*pos && *pos != stop) {
597 if ((quote = *pos) == '"' || quote == '\'') {
598 ++pos;
599 while (*pos && *pos != quote) {
600 if (*pos == '\\' && pos[1] && pos[1] == quote) {
601 pos += 2;
602 } else {
603 ++pos;
604 }
605 }
606 if (*pos) {
607 ++pos;
608 }
609 } else {
611
612 }
613 }
614 if (*pos == '\0') {
615 res = estrdup(*line);
616 *line += strlen(*line);
617 return res;
618 }
619
620 res = estrndup(*line, pos - *line);
621
622 while (*pos == stop) {
624 }
625
626 *line = pos;
627 return res;
628}
629/* }}} */
630
631static char *php_mb_rfc1867_getword_conf(const zend_encoding *encoding, char *str) /* {{{ */
632{
633 while (*str && isspace(*(unsigned char *)str)) {
634 ++str;
635 }
636
637 if (!*str) {
638 return estrdup("");
639 }
640
641 if (*str == '"' || *str == '\'') {
642 char quote = *str;
643
644 str++;
645 return php_mb_rfc1867_substring_conf(encoding, str, strlen(str), quote);
646 } else {
647 char *strend = str;
648
649 while (*strend && !isspace(*(unsigned char *)strend)) {
650 ++strend;
651 }
652 return php_mb_rfc1867_substring_conf(encoding, str, strend - str, 0);
653 }
654}
655/* }}} */
656
657static char *php_mb_rfc1867_basename(const zend_encoding *encoding, char *filename) /* {{{ */
658{
659 char *s, *s2;
660 const size_t filename_len = strlen(filename);
661
662 /* The \ check should technically be needed for win32 systems only where
663 * it is a valid path separator. However, IE in all it's wisdom always sends
664 * the full path of the file on the user's filesystem, which means that unless
665 * the user does basename() they get a bogus file name. Until IE's user base drops
666 * to nill or problem is fixed this code must remain enabled for all systems. */
667 s = php_mb_safe_strrchr(filename, '\\', filename_len, (const mbfl_encoding *)encoding);
668 s2 = php_mb_safe_strrchr(filename, '/', filename_len, (const mbfl_encoding *)encoding);
669
670 if (s && s2) {
671 if (s > s2) {
672 return ++s;
673 } else {
674 return ++s2;
675 }
676 } else if (s) {
677 return ++s;
678 } else if (s2) {
679 return ++s2;
680 } else {
681 return filename;
682 }
683}
684/* }}} */
685
686/* {{{ php.ini directive handler */
687/* {{{ static PHP_INI_MH(OnUpdate_mbstring_language) */
688static PHP_INI_MH(OnUpdate_mbstring_language)
689{
690 enum mbfl_no_language no_language;
691
692 no_language = mbfl_name2no_language(ZSTR_VAL(new_value));
693 if (no_language == mbfl_no_language_invalid) {
695 return FAILURE;
696 }
697 MBSTRG(language) = no_language;
698 php_mb_nls_get_default_detect_order_list(no_language, &MBSTRG(default_detect_order_list), &MBSTRG(default_detect_order_list_size));
699 return SUCCESS;
700}
701/* }}} */
702
703/* {{{ static PHP_INI_MH(OnUpdate_mbstring_detect_order) */
704static PHP_INI_MH(OnUpdate_mbstring_detect_order)
705{
706 const mbfl_encoding **list;
707 size_t size;
708
709 if (!new_value) {
712 }
715 return SUCCESS;
716 }
717
718 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(new_value), ZSTR_LEN(new_value), &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
719 return FAILURE;
720 }
721
724 }
727 return SUCCESS;
728}
729/* }}} */
730
731static zend_result _php_mb_ini_mbstring_http_input_set(const char *new_value, size_t new_value_length) {
732 const mbfl_encoding **list;
733 size_t size;
734 if (new_value_length == 4 && strncmp(new_value, "pass", 4) == 0) {
735 list = (const mbfl_encoding**)pecalloc(1, sizeof(mbfl_encoding*), 1);
736 *list = &mbfl_encoding_pass;
737 size = 1;
738 } else if (FAILURE == php_mb_parse_encoding_list(new_value, new_value_length, &list, &size, /* persistent */ 1, /* arg_num */ 0) || size == 0) {
739 return FAILURE;
740 }
741 if (MBSTRG(http_input_list)) {
743 }
744 MBSTRG(http_input_list) = list;
746 return SUCCESS;
747}
748
749/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_input) */
750static PHP_INI_MH(OnUpdate_mbstring_http_input)
751{
752 if (new_value) {
753 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_input is deprecated");
754 }
755
756 if (!new_value || !ZSTR_LEN(new_value)) {
757 const char *encoding = php_get_input_encoding();
759 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
760 return SUCCESS;
761 }
762
764 return _php_mb_ini_mbstring_http_input_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
765}
766/* }}} */
767
768static zend_result _php_mb_ini_mbstring_http_output_set(const char *new_value, size_t length) {
769 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(new_value, length);
770 if (!encoding) {
771 return FAILURE;
772 }
773
776 return SUCCESS;
777}
778
779/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output) */
780static PHP_INI_MH(OnUpdate_mbstring_http_output)
781{
782 if (new_value) {
783 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.http_output is deprecated");
784 }
785
786 if (new_value == NULL || ZSTR_LEN(new_value) == 0) {
787 const char *encoding = php_get_output_encoding();
789 _php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
790 return SUCCESS;
791 }
792
794 return _php_mb_ini_mbstring_http_output_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
795}
796/* }}} */
797
798/* {{{ static _php_mb_ini_mbstring_internal_encoding_set */
799static zend_result _php_mb_ini_mbstring_internal_encoding_set(const char *new_value, size_t new_value_length)
800{
801 const mbfl_encoding *encoding;
802
803 if (!new_value || !new_value_length || !(encoding = mbfl_name2encoding(new_value))) {
804 /* falls back to UTF-8 if an unknown encoding name is given */
805 if (new_value) {
806 php_error_docref("ref.mbstring", E_WARNING, "Unknown encoding \"%s\" in ini setting", new_value);
807 }
809 }
812#ifdef HAVE_MBREGEX
813 {
814 const char *enc_name = new_value;
815 if (FAILURE == php_mb_regex_set_default_mbctype(enc_name)) {
816 /* falls back to UTF-8 if an unknown encoding name is given */
817 enc_name = "UTF-8";
818 php_mb_regex_set_default_mbctype(enc_name);
819 }
820 php_mb_regex_set_mbctype(new_value);
821 }
822#endif
823 return SUCCESS;
824}
825/* }}} */
826
827/* {{{ static PHP_INI_MH(OnUpdate_mbstring_internal_encoding) */
828static PHP_INI_MH(OnUpdate_mbstring_internal_encoding)
829{
830 if (new_value) {
831 php_error_docref("ref.mbstring", E_DEPRECATED, "Use of mbstring.internal_encoding is deprecated");
832 }
833
834 if (OnUpdateString(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage) == FAILURE) {
835 return FAILURE;
836 }
837
838 if (new_value && ZSTR_LEN(new_value)) {
840 return _php_mb_ini_mbstring_internal_encoding_set(ZSTR_VAL(new_value), ZSTR_LEN(new_value));
841 } else {
842 const char *encoding = php_get_internal_encoding();
844 return _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
845 }
846}
847/* }}} */
848
849/* {{{ static PHP_INI_MH(OnUpdate_mbstring_substitute_character) */
850static PHP_INI_MH(OnUpdate_mbstring_substitute_character)
851{
852 if (new_value != NULL) {
853 if (zend_string_equals_literal_ci(new_value, "none")) {
856 } else if (zend_string_equals_literal_ci(new_value, "long")) {
859 } else if (zend_string_equals_literal_ci(new_value, "entity")) {
862 } else {
865 if (ZSTR_LEN(new_value) > 0) {
866 char *endptr = NULL;
867 int c = strtol(ZSTR_VAL(new_value), &endptr, 0);
868
869 if (*endptr == '\0') {
872 }
873 }
874 }
875 } else {
880 }
881
882 return SUCCESS;
883}
884/* }}} */
885
886/* {{{ static PHP_INI_MH(OnUpdate_mbstring_encoding_translation) */
887static PHP_INI_MH(OnUpdate_mbstring_encoding_translation)
888{
889 if (new_value == NULL) {
890 return FAILURE;
891 }
892
893 OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
894
896 sapi_unregister_post_entry(php_post_entries);
897 sapi_register_post_entries(mbstr_post_entries);
898 } else {
899 sapi_unregister_post_entry(mbstr_post_entries);
900 sapi_register_post_entries(php_post_entries);
901 }
902
903 return SUCCESS;
904}
905/* }}} */
906
907/* {{{ static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes */
908static PHP_INI_MH(OnUpdate_mbstring_http_output_conv_mimetypes)
909{
910 zend_string *tmp;
911 void *re = NULL;
912
913 if (!new_value) {
914 new_value = entry->orig_value;
915 }
916 tmp = php_trim(new_value, NULL, 0, 3);
917
918 if (ZSTR_LEN(tmp) > 0) {
919 if (!(re = _php_mb_compile_regex(ZSTR_VAL(tmp)))) {
921 return FAILURE;
922 }
923 }
924
926 _php_mb_free_regex(MBSTRG(http_output_conv_mimetypes));
927 }
928
930
932 return SUCCESS;
933}
934/* }}} */
935/* }}} */
936
937/* {{{ php.ini directive registration */
939 PHP_INI_ENTRY("mbstring.language", "neutral", PHP_INI_ALL, OnUpdate_mbstring_language)
940 PHP_INI_ENTRY("mbstring.detect_order", NULL, PHP_INI_ALL, OnUpdate_mbstring_detect_order)
941 PHP_INI_ENTRY("mbstring.http_input", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_input)
942 PHP_INI_ENTRY("mbstring.http_output", NULL, PHP_INI_ALL, OnUpdate_mbstring_http_output)
943 STD_PHP_INI_ENTRY("mbstring.internal_encoding", NULL, PHP_INI_ALL, OnUpdate_mbstring_internal_encoding, internal_encoding_name, zend_mbstring_globals, mbstring_globals)
944 PHP_INI_ENTRY("mbstring.substitute_character", NULL, PHP_INI_ALL, OnUpdate_mbstring_substitute_character)
945
946 STD_PHP_INI_BOOLEAN("mbstring.encoding_translation", "0",
948 OnUpdate_mbstring_encoding_translation,
949 encoding_translation, zend_mbstring_globals, mbstring_globals)
950 PHP_INI_ENTRY("mbstring.http_output_conv_mimetypes",
951 "^(text/|application/xhtml\\+xml)",
953 OnUpdate_mbstring_http_output_conv_mimetypes)
954
955 STD_PHP_INI_BOOLEAN("mbstring.strict_detection", "0",
957 OnUpdateBool,
958 strict_detection, zend_mbstring_globals, mbstring_globals)
959#ifdef HAVE_MBREGEX
960 STD_PHP_INI_ENTRY("mbstring.regex_stack_limit", "100000",PHP_INI_ALL, OnUpdateLong, regex_stack_limit, zend_mbstring_globals, mbstring_globals)
961 STD_PHP_INI_ENTRY("mbstring.regex_retry_limit", "1000000",PHP_INI_ALL, OnUpdateLong, regex_retry_limit, zend_mbstring_globals, mbstring_globals)
962#endif
964/* }}} */
965
966static void mbstring_internal_encoding_changed_hook(void) {
967 /* One of the internal_encoding / input_encoding / output_encoding ini settings changed. */
969 const char *encoding = php_get_internal_encoding();
970 _php_mb_ini_mbstring_internal_encoding_set(encoding, strlen(encoding));
971 }
972
973 if (!MBSTRG(http_output_set)) {
974 const char *encoding = php_get_output_encoding();
975 _php_mb_ini_mbstring_http_output_set(encoding, strlen(encoding));
976 }
977
978 if (!MBSTRG(http_input_set)) {
979 const char *encoding = php_get_input_encoding();
980 _php_mb_ini_mbstring_http_input_set(encoding, strlen(encoding));
981 }
982}
983
984/* {{{ module global initialize handler */
985static PHP_GINIT_FUNCTION(mbstring)
986{
987#if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
989#endif
990
991 mbstring_globals->language = mbfl_no_language_uni;
992 mbstring_globals->internal_encoding = NULL;
993 mbstring_globals->current_internal_encoding = mbstring_globals->internal_encoding;
994 mbstring_globals->http_output_encoding = &mbfl_encoding_pass;
995 mbstring_globals->current_http_output_encoding = &mbfl_encoding_pass;
996 mbstring_globals->http_input_identify = NULL;
997 mbstring_globals->http_input_identify_get = NULL;
998 mbstring_globals->http_input_identify_post = NULL;
999 mbstring_globals->http_input_identify_cookie = NULL;
1000 mbstring_globals->http_input_identify_string = NULL;
1001 mbstring_globals->http_input_list = NULL;
1002 mbstring_globals->http_input_list_size = 0;
1003 mbstring_globals->detect_order_list = NULL;
1004 mbstring_globals->detect_order_list_size = 0;
1005 mbstring_globals->current_detect_order_list = NULL;
1006 mbstring_globals->current_detect_order_list_size = 0;
1007 mbstring_globals->default_detect_order_list = (enum mbfl_no_encoding *) php_mb_default_identify_list_neut;
1008 mbstring_globals->default_detect_order_list_size = sizeof(php_mb_default_identify_list_neut) / sizeof(php_mb_default_identify_list_neut[0]);
1009 mbstring_globals->filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1010 mbstring_globals->filter_illegal_substchar = '?';
1011 mbstring_globals->current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
1012 mbstring_globals->current_filter_illegal_substchar = '?';
1013 mbstring_globals->illegalchars = 0;
1014 mbstring_globals->encoding_translation = 0;
1015 mbstring_globals->strict_detection = 0;
1016 mbstring_globals->outconv_enabled = false;
1017 mbstring_globals->outconv_state = 0;
1018 mbstring_globals->http_output_conv_mimetypes = NULL;
1019#ifdef HAVE_MBREGEX
1020 mbstring_globals->mb_regex_globals = php_mb_regex_globals_alloc();
1021#endif
1022 mbstring_globals->last_used_encoding_name = NULL;
1023 mbstring_globals->last_used_encoding = NULL;
1024 mbstring_globals->internal_encoding_set = 0;
1025 mbstring_globals->http_output_set = 0;
1026 mbstring_globals->http_input_set = 0;
1027 mbstring_globals->all_encodings_list = NULL;
1028}
1029/* }}} */
1030
1031/* {{{ PHP_GSHUTDOWN_FUNCTION */
1032static PHP_GSHUTDOWN_FUNCTION(mbstring)
1033{
1034 if (mbstring_globals->http_input_list) {
1035 free(ZEND_VOIDP(mbstring_globals->http_input_list));
1036 }
1037 if (mbstring_globals->detect_order_list) {
1038 free(ZEND_VOIDP(mbstring_globals->detect_order_list));
1039 }
1040 if (mbstring_globals->http_output_conv_mimetypes) {
1041 _php_mb_free_regex(mbstring_globals->http_output_conv_mimetypes);
1042 }
1043#ifdef HAVE_MBREGEX
1044 php_mb_regex_globals_free(mbstring_globals->mb_regex_globals);
1045#endif
1046}
1047/* }}} */
1048
1049#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1050static void init_check_utf8(void);
1051#endif
1052
1053/* {{{ PHP_MINIT_FUNCTION(mbstring) */
1055{
1056#if defined(COMPILE_DL_MBSTRING) && defined(ZTS)
1058#endif
1059
1061
1062 /* We assume that we're the only user of the hook. */
1064 php_internal_encoding_changed = mbstring_internal_encoding_changed_hook;
1065 mbstring_internal_encoding_changed_hook();
1066
1067 /* This is a global handler. Should not be set in a per-request handler. */
1068 sapi_register_treat_data(mbstr_treat_data);
1069
1070 /* Post handlers are stored in the thread-local context. */
1072 sapi_register_post_entries(mbstr_post_entries);
1073 }
1074
1075#ifdef HAVE_MBREGEX
1077#endif
1078
1079 register_mbstring_symbols(module_number);
1080
1081 if (FAILURE == zend_multibyte_set_functions(&php_mb_zend_multibyte_functions)) {
1082 return FAILURE;
1083 }
1084
1086 php_mb_encoding_translation,
1087 php_mb_gpc_get_detect_order,
1088 php_mb_gpc_set_input_encoding,
1089 php_mb_rfc1867_getword,
1090 php_mb_rfc1867_getword_conf,
1091 php_mb_rfc1867_basename);
1092
1093#ifdef ZEND_INTRIN_AVX2_FUNC_PTR
1094 init_check_utf8();
1095 init_convert_utf16();
1096#endif
1097
1098 return SUCCESS;
1099}
1100/* }}} */
1101
1102/* {{{ PHP_MSHUTDOWN_FUNCTION(mbstring) */
1104{
1106
1108
1109#ifdef HAVE_MBREGEX
1111#endif
1112
1114
1115 return SUCCESS;
1116}
1117/* }}} */
1118
1119/* {{{ PHP_RINIT_FUNCTION(mbstring) */
1138/* }}} */
1139
1140/* {{{ PHP_RSHUTDOWN_FUNCTION(mbstring) */
1142{
1147 }
1148
1149 /* clear http input identification. */
1155
1157 zend_string_release(MBSTRG(last_used_encoding_name));
1159 }
1160
1164
1165 MBSTRG(outconv_enabled) = false;
1166 MBSTRG(outconv_state) = 0;
1167
1172 }
1173
1174#ifdef HAVE_MBREGEX
1176#endif
1177
1178 return SUCCESS;
1179}
1180/* }}} */
1181
1182/* {{{ PHP_MINFO_FUNCTION(mbstring) */
1184{
1186 php_info_print_table_row(2, "Multibyte Support", "enabled");
1187 php_info_print_table_row(2, "Multibyte string engine", "libmbfl");
1188 php_info_print_table_row(2, "HTTP input encoding translation", MBSTRG(encoding_translation) ? "enabled": "disabled");
1189 {
1190 char tmp[256];
1191 snprintf(tmp, sizeof(tmp), "%d.%d.%d", MBFL_VERSION_MAJOR, MBFL_VERSION_MINOR, MBFL_VERSION_TEENY);
1192 php_info_print_table_row(2, "libmbfl version", tmp);
1193 }
1195
1197 php_info_print_table_header(1, "mbstring extension makes use of \"streamable kanji code filter and converter\", which is distributed under the GNU Lesser General Public License version 2.1.");
1199
1200#ifdef HAVE_MBREGEX
1202#endif
1203
1205}
1206/* }}} */
1207
1208/* {{{ Sets the current language or Returns the current language as a string */
1210{
1212
1217
1218 if (name == NULL) {
1220 } else {
1221 zend_string *ini_name = ZSTR_INIT_LITERAL("mbstring.language", 0);
1223 zend_argument_value_error(1, "must be a valid language, \"%s\" given", ZSTR_VAL(name));
1224 zend_string_release_ex(ini_name, 0);
1225 RETURN_THROWS();
1226 }
1227 // TODO Make return void
1229 zend_string_release_ex(ini_name, 0);
1230 }
1231}
1232/* }}} */
1233
1234/* {{{ Sets the current internal encoding or Returns the current internal encoding as a string */
1236{
1237 char *name = NULL;
1238 size_t name_len;
1239 const mbfl_encoding *encoding;
1240
1243 Z_PARAM_STRING_OR_NULL(name, name_len)
1245
1246 if (name == NULL) {
1249 } else {
1251 if (!encoding) {
1252 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1253 RETURN_THROWS();
1254 } else {
1257 /* TODO Return old encoding */
1259 }
1260 }
1261}
1262/* }}} */
1263
1264/* {{{ Returns the input encoding */
1266{
1267 char *type = NULL;
1268 size_t type_len = 0, n;
1269 const mbfl_encoding **entry;
1270 const mbfl_encoding *encoding;
1271
1274 Z_PARAM_STRING_OR_NULL(type, type_len)
1276
1277 if (type == NULL) {
1279 } else if (type_len != 1) {
1281 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1282 RETURN_THROWS();
1283 } else {
1284 switch (*type) {
1285 case 'G':
1286 case 'g':
1288 break;
1289 case 'P':
1290 case 'p':
1292 break;
1293 case 'C':
1294 case 'c':
1296 break;
1297 case 'S':
1298 case 's':
1300 break;
1301 case 'I':
1302 case 'i':
1303 entry = MBSTRG(http_input_list);
1306 for (size_t i = 0; i < n; i++, entry++) {
1307 add_next_index_string(return_value, (*entry)->name);
1308 }
1309 return;
1310 case 'L':
1311 case 'l':
1312 entry = MBSTRG(http_input_list);
1314 if (n == 0) {
1316 }
1317
1318 smart_str result = {0};
1319 for (size_t i = 0; i < n; i++, entry++) {
1320 if (i > 0) {
1321 smart_str_appendc(&result, ',');
1322 }
1323 smart_str_appends(&result, (*entry)->name);
1324 }
1325 RETURN_STR(smart_str_extract(&result));
1326 default:
1328 "must be one of \"G\", \"P\", \"C\", \"S\", \"I\", or \"L\"");
1329 RETURN_THROWS();
1330 }
1331 }
1332
1333 if (encoding) {
1334 RETURN_STRING(encoding->name);
1335 } else {
1337 }
1338}
1339/* }}} */
1340
1341/* {{{ Sets the current output_encoding or returns the current output_encoding as a string */
1343{
1344 char *name = NULL;
1345 size_t name_len;
1346
1349 Z_PARAM_PATH_OR_NULL(name, name_len) /* For null byte check */
1351
1352 if (name == NULL) {
1355 } else {
1356 const mbfl_encoding *encoding = php_mb_get_encoding_or_pass(name, name_len);
1357 if (!encoding) {
1358 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1359 RETURN_THROWS();
1360 } else {
1363 /* TODO Return previous encoding? */
1365 }
1366 }
1367}
1368/* }}} */
1369
1370/* {{{ Sets the current detect_order or returns the current detect_order as an array */
1372{
1373 zend_string *order_str = NULL;
1374 HashTable *order_ht = NULL;
1375
1378 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(order_ht, order_str)
1380
1381 if (!order_str && !order_ht) {
1385 for (size_t i = 0; i < n; i++) {
1386 add_next_index_string(return_value, (*entry)->name);
1387 entry++;
1388 }
1389 } else {
1390 const mbfl_encoding **list;
1391 size_t size;
1392 if (order_ht) {
1393 if (FAILURE == php_mb_parse_encoding_array(order_ht, &list, &size, 1)) {
1394 RETURN_THROWS();
1395 }
1396 } else {
1397 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(order_str), ZSTR_LEN(order_str), &list, &size, /* persistent */ 0, /* arg_num */ 1)) {
1398 RETURN_THROWS();
1399 }
1400 }
1401
1402 if (size == 0) {
1403 efree(ZEND_VOIDP(list));
1404 zend_argument_value_error(1, "must specify at least one encoding");
1405 RETURN_THROWS();
1406 }
1407
1410 }
1414 }
1415}
1416/* }}} */
1417
1418static inline bool php_mb_check_code_point(zend_long cp)
1419{
1420 if (cp < 0 || cp >= 0x110000) {
1421 /* Out of Unicode range */
1422 return false;
1423 }
1424
1425 if (cp >= 0xd800 && cp <= 0xdfff) {
1426 /* Surrogate code-point. These are never valid on their own and we only allow a single
1427 * substitute character. */
1428 return false;
1429 }
1430
1431 /* As we do not know the target encoding of the conversion operation that is going to
1432 * use the substitution character, we cannot check whether the codepoint is actually mapped
1433 * in the given encoding at this point. Thus we have to accept everything. */
1434 return true;
1435}
1436
1437/* {{{ Sets the current substitute_character or returns the current substitute_character */
1439{
1440 zend_string *substitute_character = NULL;
1441 zend_long substitute_codepoint;
1442 bool substitute_is_null = 1;
1443
1446 Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
1448
1449 if (substitute_is_null) {
1451 RETURN_STRING("none");
1452 }
1454 RETURN_STRING("long");
1455 }
1457 RETURN_STRING("entity");
1458 }
1460 }
1461
1462 if (substitute_character != NULL) {
1463 if (zend_string_equals_literal_ci(substitute_character, "none")) {
1466 }
1467 if (zend_string_equals_literal_ci(substitute_character, "long")) {
1470 }
1471 if (zend_string_equals_literal_ci(substitute_character, "entity")) {
1474 }
1475 /* Invalid string value */
1476 zend_argument_value_error(1, "must be \"none\", \"long\", \"entity\" or a valid codepoint");
1477 RETURN_THROWS();
1478 }
1479 /* Integer codepoint passed */
1480 if (!php_mb_check_code_point(substitute_codepoint)) {
1481 zend_argument_value_error(1, "is not a valid codepoint");
1482 RETURN_THROWS();
1483 }
1484
1486 MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
1488}
1489/* }}} */
1490
1491/* {{{ Return the preferred MIME name (charset) as a string */
1493{
1494 char *name = NULL;
1495 size_t name_len;
1496
1498 Z_PARAM_STRING(name, name_len)
1500
1502 if (enc == NULL) {
1503 zend_argument_value_error(1, "must be a valid encoding, \"%s\" given", name);
1504 RETURN_THROWS();
1505 }
1506
1507 const char *preferred_name = mbfl_encoding_preferred_mime_name(enc);
1508 if (preferred_name == NULL || *preferred_name == '\0') {
1509 php_error_docref(NULL, E_WARNING, "No MIME preferred name corresponding to \"%s\"", name);
1511 } else {
1512 RETVAL_STRING((char *)preferred_name);
1513 }
1514}
1515/* }}} */
1516
1517/* {{{ Parses GET/POST/COOKIE data and sets global variables */
1519{
1520 zval *track_vars_array = NULL;
1521 char *encstr;
1522 size_t encstr_len;
1524 const mbfl_encoding *detected;
1525
1527 Z_PARAM_STRING(encstr, encstr_len)
1528 Z_PARAM_ZVAL(track_vars_array)
1530
1531 track_vars_array = zend_try_array_init(track_vars_array);
1532 if (!track_vars_array) {
1533 RETURN_THROWS();
1534 }
1535
1536 encstr = estrndup(encstr, encstr_len);
1537
1538 info.data_type = PARSE_STRING;
1539 info.separator = PG(arg_separator).input;
1540 info.report_errors = true;
1544
1545 detected = _php_mb_encoding_handler_ex(&info, track_vars_array, encstr);
1546
1547 MBSTRG(http_input_identify) = detected;
1548
1549 RETVAL_BOOL(detected);
1550
1551 if (encstr != NULL) efree(encstr);
1552}
1553/* }}} */
1554
1556{
1557 zend_string *str;
1558 zend_long arg_status;
1559
1561 Z_PARAM_STR(str)
1562 Z_PARAM_LONG(arg_status)
1564
1566 if (encoding == &mbfl_encoding_pass) {
1567 RETURN_STR_COPY(str);
1568 }
1569
1570 if (arg_status & PHP_OUTPUT_HANDLER_START) {
1571 bool free_mimetype = false;
1572 char *mimetype = NULL;
1573
1574 /* Analyze mime type */
1575 if (SG(sapi_headers).mimetype
1577 && _php_mb_match_regex(MBSTRG(http_output_conv_mimetypes), SG(sapi_headers).mimetype, strlen(SG(sapi_headers).mimetype))) {
1578 char *s;
1579 if ((s = strchr(SG(sapi_headers).mimetype, ';')) == NULL) {
1580 mimetype = estrdup(SG(sapi_headers).mimetype);
1581 } else {
1582 mimetype = estrndup(SG(sapi_headers).mimetype, s - SG(sapi_headers).mimetype);
1583 }
1584 free_mimetype = true;
1585 } else if (SG(sapi_headers).send_default_content_type) {
1586 mimetype = SG(default_mimetype) ? SG(default_mimetype) : SAPI_DEFAULT_MIMETYPE;
1587 }
1588
1589 /* If content-type is not yet set, set it and enable conversion */
1590 if (SG(sapi_headers).send_default_content_type || free_mimetype) {
1591 const char *charset = encoding->mime_name;
1592 if (charset) {
1593 char *p;
1594 size_t len = spprintf(&p, 0, "Content-Type: %s; charset=%s", mimetype, charset);
1595 if (sapi_add_header(p, len, 0) != FAILURE) {
1596 SG(sapi_headers).send_default_content_type = 0;
1597 }
1598 }
1599
1600 MBSTRG(outconv_enabled) = true;
1601 }
1602
1603 if (free_mimetype) {
1604 efree(mimetype);
1605 }
1606 }
1607
1608 if (!MBSTRG(outconv_enabled)) {
1609 RETURN_STR_COPY(str);
1610 }
1611
1614
1615 uint32_t wchar_buf[128];
1616 unsigned char *in = (unsigned char*)ZSTR_VAL(str);
1617 size_t in_len = ZSTR_LEN(str);
1618 bool last_feed = ((arg_status & PHP_OUTPUT_HANDLER_END) != 0);
1619
1620 while (in_len) {
1621 size_t out_len = MBSTRG(current_internal_encoding)->to_wchar(&in, &in_len, wchar_buf, 128, &MBSTRG(outconv_state));
1622 ZEND_ASSERT(out_len <= 128);
1623 encoding->from_wchar(wchar_buf, out_len, &buf, !in_len && last_feed);
1624 }
1625
1626 MBSTRG(illegalchars) += buf.errors;
1627 RETVAL_STR(mb_convert_buf_result_raw(&buf));
1628
1629 if (last_feed) {
1630 MBSTRG(outconv_enabled) = false;
1631 MBSTRG(outconv_state) = 0;
1632 }
1633}
1634
1636{
1637 zend_string *str, *encoding = NULL;
1638 zend_long split_len = 1;
1639
1641 Z_PARAM_STR(str)
1643 Z_PARAM_LONG(split_len)
1646
1647 if (split_len <= 0) {
1648 zend_argument_value_error(2, "must be greater than 0");
1649 RETURN_THROWS();
1650 } else if (split_len > UINT_MAX / 4) {
1651 zend_argument_value_error(2, "is too large");
1652 RETURN_THROWS();
1653 }
1654
1655 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
1656 if (!enc) {
1657 RETURN_THROWS();
1658 }
1659
1660 if (ZSTR_LEN(str) == 0) {
1662 }
1663
1664 unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
1665
1666 unsigned int char_len = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1667 if (char_len) {
1668 unsigned int chunk_len = char_len * split_len;
1669 unsigned int chunks = ((ZSTR_LEN(str) / chunk_len) + split_len - 1) / split_len; /* round up */
1671 while (p < e) {
1672 add_next_index_stringl(return_value, (const char*)p, MIN(chunk_len, e - p));
1673 p += chunk_len;
1674 }
1675 } else if (enc->mblen_table) {
1676 unsigned char const *mbtab = enc->mblen_table;
1677
1678 /* Assume that we have 1-byte characters */
1679 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1680
1681 while (p < e) {
1682 unsigned char *chunk = p; /* start of chunk */
1683
1684 for (int char_count = 0; char_count < split_len && p < e; char_count++) {
1685 p += mbtab[*p];
1686 }
1687 if (p > e) {
1688 p = e; /* ensure chunk is in bounds */
1689 }
1690 add_next_index_stringl(return_value, (const char*)chunk, p - chunk);
1691 }
1692 } else {
1693 /* Assume that we have 1-byte characters */
1694 array_init_size(return_value, (ZSTR_LEN(str) + split_len - 1) / split_len);
1695
1696 uint32_t wchar_buf[128];
1697 size_t in_len = ZSTR_LEN(str);
1698 unsigned int state = 0, char_count = 0;
1699
1701
1702 while (in_len) {
1703 size_t out_len = enc->to_wchar(&p, &in_len, wchar_buf, 128, &state);
1704 ZEND_ASSERT(out_len <= 128);
1705 size_t i = 0;
1706
1707 /* Is there some output remaining from the previous iteration? */
1708 if (char_count) {
1709 if (out_len >= split_len - char_count) {
1710 /* Finish off an incomplete chunk from previous iteration
1711 * ('buf' was already initialized; we don't need to do it again) */
1712 enc->from_wchar(wchar_buf, split_len - char_count, &buf, true);
1713 i += split_len - char_count;
1714 char_count = 0;
1715 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1716 } else {
1717 /* Output from this iteration is not enough to finish the next chunk;
1718 * output what we can, and leave 'buf' to be used again on next iteration */
1719 enc->from_wchar(wchar_buf, out_len, &buf, !in_len);
1720 char_count += out_len;
1721 continue;
1722 }
1723 }
1724
1725 while (i < out_len) {
1726 /* Prepare for the next chunk */
1728
1729 if (out_len - i >= split_len) {
1730 enc->from_wchar(wchar_buf + i, split_len, &buf, true);
1731 i += split_len;
1732 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1733 } else {
1734 /* The remaining codepoints in wchar_buf aren't enough to finish a chunk;
1735 * leave them for the next iteration */
1736 enc->from_wchar(wchar_buf + i, out_len - i, &buf, !in_len);
1737 char_count = out_len - i;
1738 break;
1739 }
1740 }
1741 }
1742
1743 if (char_count) {
1744 /* The main loop above has finished processing the input string, but
1745 * has left a partial chunk in 'buf' */
1746 add_next_index_str(return_value, mb_convert_buf_result(&buf, enc));
1747 }
1748 }
1749}
1750
#ifdef __SSE2__
/* Horizontal sum: treat the 128-bit register `v` as sixteen unsigned 8-bit integers
 * and return their total in an ordinary scalar.
 *
 * Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated instruction for summing the bytes of an XMM register.
	 * _mm_sad_epu8 sums the absolute differences between corresponding bytes of its
	 * two operands; with an all-zero second operand those "differences" are simply
	 * the bytes of `v`. The result is delivered as two 16-bit sums, one in the bottom
	 * of each 64-bit half of the destination register. */
	const __m128i half_sums = _mm_sad_epu8(v, _mm_setzero_si128());

	/* Pull both partial sums out into scalar registers and combine them:
	 * _mm_cvtsi128_si32 reads the low half, _mm_extract_epi16 (word 4) the high half */
	const uint32_t low_half = _mm_cvtsi128_si32(half_sums);
	const uint32_t high_half = _mm_extract_epi16(half_sums, 4);

	return low_half + high_half;
}
#endif

/* Count the codepoints in `len` bytes starting at `p`.
 *
 * The input is assumed to be valid UTF-8. In UTF-8 the only bytes which do not
 * start a new codepoint are the continuation bytes 0x80-0xBF (as signed 8-bit
 * values: everything below -64). The codepoint count is therefore the byte length
 * minus the number of continuation bytes. */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	unsigned char *end = p + len;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		/* Highest address from which a full 16-byte load still ends inside the input */
		unsigned char *last_load = end - sizeof(__m128i);
		const __m128i cont_limit = _mm_set1_epi8(-64);
		__m128i lanes = _mm_setzero_si128(); /* sixteen 8-bit continuation-byte counters */
		unsigned int rounds = 0;

		for (; p <= last_load; p += sizeof(__m128i)) {
			__m128i chunk = _mm_loadu_si128((__m128i*)p);
			/* Each lane becomes 0xFF (i.e. -1) for a continuation byte, 0 otherwise */
			__m128i is_cont = _mm_cmplt_epi8(chunk, cont_limit);
			/* Subtracting -1 bumps the matching counters by one */
			lanes = _mm_sub_epi8(lanes, is_cont);

			/* An 8-bit counter tops out at 255, so after 255 rounds move the
			 * counts into `len` and start again from zero */
			if (++rounds == 255) {
				len -= _mm_sum_epu8(lanes);
				lanes = _mm_setzero_si128();
				rounds = 0;
			}
		}

		/* Account for whatever is still sitting in the counters */
		len -= _mm_sum_epu8(lanes);
	}
#endif

	/* Scalar pass over the bytes not covered above (the final 0-15 bytes when the
	 * SSE2 path ran, the whole input otherwise) */
	for (; p < end; p++) {
		if ((*p & 0xC0) == 0x80) {
			len--;
		}
	}

	return len;
}
1822
1823static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
1824{
1825 unsigned int char_len = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
1826 if (char_len) {
1827 return ZSTR_LEN(string) / char_len;
1828 } else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
1829 return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
1830 }
1831
1832 uint32_t wchar_buf[128];
1833 unsigned char *in = (unsigned char*)ZSTR_VAL(string);
1834 size_t in_len = ZSTR_LEN(string);
1835 unsigned int state = 0;
1836 size_t len = 0;
1837
1838 while (in_len) {
1839 len += encoding->to_wchar(&in, &in_len, wchar_buf, 128, &state);
1840 }
1841
1842 return len;
1843}
1844
1845/* {{{ Get character numbers of a string */
1847{
1848 zend_string *string, *enc_name = NULL;
1849
1851 Z_PARAM_STR(string)
1853 Z_PARAM_STR_OR_NULL(enc_name)
1855
1856 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
1857 if (!enc) {
1858 RETURN_THROWS();
1859 }
1860
1861 RETVAL_LONG(mb_get_strlen(string, enc));
1862}
1863/* }}} */
1864
1865/* See mbfl_no_encoding definition for list of UTF-8 encodings */
1866static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc)
1867{
1868 return (no_enc >= mbfl_no_encoding_utf8 && no_enc <= mbfl_no_encoding_utf8_sb);
1869}
1870
1871static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *end, ssize_t offset) {
1872 if (offset < 0) {
1873 unsigned char *pos = end;
1874 while (offset < 0) {
1875 if (pos <= str) {
1876 return NULL;
1877 }
1878
1879 unsigned char c = *--pos;
1880 if (c < 0x80 || (c & 0xC0) != 0x80) {
1881 offset++;
1882 }
1883 }
1884 return pos;
1885 } else {
1886 const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
1887 unsigned char *pos = str;
1888 while (offset-- > 0) {
1889 if (pos >= end) {
1890 return NULL;
1891 }
1892 pos += u8_tbl[*pos];
1893 }
1894 return pos;
1895 }
1896}
1897
/* Convert a byte position inside a UTF-8 buffer into a character offset by
 * counting the characters in [start, pos) with mb_fast_strlen_utf8(). */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	const size_t byte_span = pos - start;

	return mb_fast_strlen_utf8(start, byte_span);
}
1901
1902static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
1903{
1904 size_t result;
1905 zend_string *haystack_u8 = NULL, *needle_u8 = NULL;
1906 unsigned char *offset_pointer;
1907
1908 if (!php_mb_is_no_encoding_utf8(enc->no_encoding)) {
1909 unsigned int num_errors = 0;
1910 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
1911 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
1912 } else {
1913 haystack_u8 = haystack;
1914 needle_u8 = needle;
1915 }
1916
1917 offset_pointer = offset_to_pointer_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), offset);
1918 if (!offset_pointer) {
1920 goto out;
1921 }
1922
1924 if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
1925 goto out;
1926 }
1927
1928 const char *found_pos;
1929 if (!reverse) {
1930 found_pos = zend_memnstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
1931 } else if (offset >= 0) {
1932 found_pos = zend_memnrstr((const char*)offset_pointer, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8));
1933 } else {
1934 size_t needle_len = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(needle), (unsigned char*)ZSTR_VAL(needle) + ZSTR_LEN(needle));
1935 offset_pointer = offset_to_pointer_utf8(offset_pointer, (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8), needle_len);
1936 if (!offset_pointer) {
1937 offset_pointer = (unsigned char*)ZSTR_VAL(haystack_u8) + ZSTR_LEN(haystack_u8);
1938 }
1939
1940 found_pos = zend_memnrstr(ZSTR_VAL(haystack_u8), ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), (const char*)offset_pointer);
1941 }
1942
1943 if (found_pos) {
1944 result = pointer_to_offset_utf8((unsigned char*)ZSTR_VAL(haystack_u8), (unsigned char*)found_pos);
1945 }
1946
1947out:
1948 if (haystack_u8 != haystack) {
1949 zend_string_free(haystack_u8);
1950 }
1951 if (needle_u8 != needle) {
1952 zend_string_free(needle_u8);
1953 }
1954 return result;
1955}
1956
1957static void handle_strpos_error(size_t error) {
1958 switch (error) {
1960 break;
1962 php_error_docref(NULL, E_WARNING, "Conversion error");
1963 break;
1964 case MBFL_ERROR_OFFSET:
1965 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
1966 break;
1967 default:
1968 zend_value_error("mb_strpos(): Unknown error");
1969 break;
1970 }
1971}
1972
1974{
1975 zend_long offset = 0;
1976 zend_string *needle, *haystack;
1977 zend_string *enc_name = NULL;
1978
1980 Z_PARAM_STR(haystack)
1981 Z_PARAM_STR(needle)
1984 Z_PARAM_STR_OR_NULL(enc_name)
1986
1987 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
1988 if (!enc) {
1989 RETURN_THROWS();
1990 }
1991
1992 size_t n = mb_find_strpos(haystack, needle, enc, offset, false);
1993 if (!mbfl_is_error(n)) {
1994 RETVAL_LONG(n);
1995 } else {
1996 handle_strpos_error(n);
1998 }
1999}
2000
2001/* {{{ Find position of last occurrence of a string within another */
2003{
2004 zend_long offset = 0;
2005 zend_string *needle, *haystack;
2006 zend_string *enc_name = NULL;
2007
2009 Z_PARAM_STR(haystack)
2010 Z_PARAM_STR(needle)
2013 Z_PARAM_STR_OR_NULL(enc_name)
2015
2016 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 4);
2017 if (!enc) {
2018 RETURN_THROWS();
2019 }
2020
2021 size_t n = mb_find_strpos(haystack, needle, enc, offset, true);
2022 if (!mbfl_is_error(n)) {
2023 RETVAL_LONG(n);
2024 } else {
2025 handle_strpos_error(n);
2027 }
2028}
2029/* }}} */
2030
2031/* {{{ Finds position of first occurrence of a string within another, case insensitive */
2033{
2034 zend_long offset = 0;
2035 zend_string *haystack, *needle;
2036 zend_string *from_encoding = NULL;
2037
2039 Z_PARAM_STR(haystack)
2040 Z_PARAM_STR(needle)
2043 Z_PARAM_STR_OR_NULL(from_encoding)
2045
2046 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2047 if (!enc) {
2048 RETURN_THROWS();
2049 }
2050
2051 size_t n = php_mb_stripos(false, haystack, needle, offset, enc);
2052
2053 if (!mbfl_is_error(n)) {
2054 RETVAL_LONG(n);
2055 } else {
2056 handle_strpos_error(n);
2058 }
2059}
2060/* }}} */
2061
2062/* {{{ Finds position of last occurrence of a string within another, case insensitive */
2064{
2065 zend_long offset = 0;
2066 zend_string *haystack, *needle;
2067 zend_string *from_encoding = NULL;
2068
2070 Z_PARAM_STR(haystack)
2071 Z_PARAM_STR(needle)
2074 Z_PARAM_STR_OR_NULL(from_encoding)
2076
2077 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 4);
2078 if (!enc) {
2079 RETURN_THROWS();
2080 }
2081
2082 size_t n = php_mb_stripos(true, haystack, needle, offset, enc);
2083
2084 if (!mbfl_is_error(n)) {
2085 RETVAL_LONG(n);
2086 } else {
2087 handle_strpos_error(n);
2089 }
2090}
2091/* }}} */
2092
2093static zend_string* mb_get_substr_slow(unsigned char *in, size_t in_len, size_t from, size_t len, const mbfl_encoding *enc)
2094{
2095 uint32_t wchar_buf[128];
2096 unsigned int state = 0;
2097
2099 mb_convert_buf_init(&buf, MIN(len, in_len - from), MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode));
2100
2101 while (in_len && len) {
2102 size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2103 ZEND_ASSERT(out_len <= 128);
2104
2105 if (from >= out_len) {
2106 from -= out_len;
2107 } else {
2108 size_t needed_codepoints = MIN(out_len - from, len);
2109 enc->from_wchar(wchar_buf + from, needed_codepoints, &buf, !in_len || out_len >= len);
2110 from = 0;
2111 len -= needed_codepoints;
2112 }
2113 }
2114
2115 return mb_convert_buf_result(&buf, enc);
2116}
2117
2118static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, const mbfl_encoding *enc)
2119{
2120 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2121 size_t in_len = ZSTR_LEN(input);
2122
2123 if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
2124 /* Other than MacJapanese, no supported text encoding decodes to
2125 * more than one codepoint per byte
2126 * So if the number of codepoints to skip >= number of input bytes,
2127 * then definitely the output should be empty */
2128 return zend_empty_string;
2129 }
2130
2131 /* Does each codepoint have a fixed byte width? */
2132 unsigned int flag = enc->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2133 if (flag) {
2134 /* The value of the flag is 2 if each codepoint takes 2 bytes, or 4 if 4 bytes */
2135 from *= flag;
2136 len *= flag;
2137 if (from >= in_len) {
2138 return zend_empty_string;
2139 }
2140 in += from;
2141 in_len -= from;
2142 if (len > in_len) {
2143 len = in_len;
2144 }
2145 return zend_string_init_fast((const char*)in, len);
2146 }
2147
2148 return mb_get_substr_slow(in, in_len, from, len, enc);
2149}
2150
2151#define MB_STRSTR 1
2152#define MB_STRRCHR 2
2153#define MB_STRISTR 3
2154#define MB_STRRICHR 4
2155
2156static void php_mb_strstr_variants(INTERNAL_FUNCTION_PARAMETERS, unsigned int variant)
2157{
2158 bool reverse_mode = false, part = false;
2159 size_t n;
2160 zend_string *haystack, *needle;
2161 zend_string *encoding_name = NULL;
2162
2164 Z_PARAM_STR(haystack)
2165 Z_PARAM_STR(needle)
2167 Z_PARAM_BOOL(part)
2168 Z_PARAM_STR_OR_NULL(encoding_name)
2170
2171 const mbfl_encoding *enc = php_mb_get_encoding(encoding_name, 4);
2172 if (!enc) {
2173 RETURN_THROWS();
2174 }
2175
2176 if (variant == MB_STRRCHR || variant == MB_STRRICHR) {
2177 reverse_mode = true;
2178 }
2179
2180 if (variant == MB_STRISTR || variant == MB_STRRICHR) {
2181 n = php_mb_stripos(reverse_mode, haystack, needle, 0, enc);
2182 } else {
2183 n = mb_find_strpos(haystack, needle, enc, 0, reverse_mode);
2184 }
2185
2186 if (!mbfl_is_error(n)) {
2187 if (part) {
2188 RETVAL_STR(mb_get_substr(haystack, 0, n, enc));
2189 } else {
2190 RETVAL_STR(mb_get_substr(haystack, n, MBFL_SUBSTR_UNTIL_END, enc));
2191 }
2192 } else {
2193 // FIXME use handle_strpos_error(n)
2195 }
2196}
2197
2198/* {{{ Finds first occurrence of a string within another */
2200{
2201 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRSTR);
2202}
2203/* }}} */
2204
2205/* {{{ Finds the last occurrence of a character in a string within another */
2207{
2208 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRCHR);
2209}
2210/* }}} */
2211
2212/* {{{ Finds first occurrence of a string within another, case insensitive */
2214{
2215 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRISTR);
2216}
2217/* }}} */
2218
2219/* {{{ Finds the last occurrence of a character in a string within another, case insensitive */
2221{
2222 php_mb_strstr_variants(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_STRRICHR);
2223}
2224/* }}} */
2225
2226#undef MB_STRSTR
2227#undef MB_STRRCHR
2228#undef MB_STRISTR
2229#undef MB_STRRICHR
2230
/* NOTE(review): the signature line of this function is missing from this listing
 * (presumably PHP_FUNCTION(mb_substr_count) -- TODO confirm), as are the lines that
 * open/close argument parsing, raise the errors and set the return value.
 *
 * The visible body counts non-overlapping occurrences of `needle` in `haystack`:
 * both strings are brought to UTF-8, then the UTF-8 bytes are scanned with zend_memnstr(). */
2232{
2233 zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;
2234
2236 Z_PARAM_STR(haystack)
2237 Z_PARAM_STR(needle)
2239 Z_PARAM_STR_OR_NULL(enc_name)
2241
/* An empty needle is rejected before any conversion work is done */
2242 if (ZSTR_LEN(needle) == 0) {
2244 RETURN_THROWS();
2245 }
2246
2247 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
2248 if (!enc) {
2249 RETURN_THROWS();
2250 }
2251
2252 if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
2253 /* No need to do any conversion if haystack/needle are already known-valid UTF-8
2254 * (If they are not valid, then not passing them through conversion filters could affect output) */
2255 if (ZSTR_IS_VALID_UTF8(haystack)) {
2256 haystack_u8 = haystack;
2257 } else {
2258 unsigned int num_errors = 0;
2259 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2260 if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
2262 }
2263 }
2264
2265 if (ZSTR_IS_VALID_UTF8(needle)) {
2266 needle_u8 = needle;
2267 } else {
2268 unsigned int num_errors = 0;
2269 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2270 if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
2272 }
2273 }
2274 } else {
/* Any other encoding: always convert both strings to UTF-8 (one shared error counter) */
2275 unsigned int num_errors = 0;
2276 haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2277 needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
2278 /* A string with >0 bytes may convert to 0 codepoints; for example, the contents
2279 * may be only escape sequences */
2280 if (ZSTR_LEN(needle_u8) == 0) {
2281 zend_string_free(haystack_u8);
2282 zend_string_free(needle_u8);
2284 RETURN_THROWS();
2285 }
2286 }
2287
2288 size_t result = 0;
2289
/* A needle longer (in UTF-8 bytes) than the haystack cannot match; skip the scan */
2290 if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
2291 goto out;
2292 }
2293
/* After each hit the cursor jumps past the whole needle, so overlapping
 * occurrences are not counted */
2294 const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
2295 while (true) {
2296 p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
2297 if (!p) {
2298 break;
2299 }
2300 p += ZSTR_LEN(needle_u8);
2301 result++;
2302 }
2303
2304out:
/* Only strings produced by conversion are freed; the argument strings are left alone */
2305 if (haystack_u8 != haystack) {
2306 zend_string_free(haystack_u8);
2307 }
2308 if (needle_u8 != needle) {
2309 zend_string_free(needle_u8);
2310 }
2311
2313}
2314
2315/* {{{ Returns part of a string */
/* NOTE(review): the signature line is missing from this listing (presumably
 * PHP_FUNCTION(mb_substr) -- TODO confirm), as are the argument-parsing open/close
 * lines and the lines raising the two range errors.
 *
 * Resolves the possibly-negative `from` / `len` arguments into a non-negative
 * character offset and count, then returns mb_get_substr() of `str`. */
2317{
2318 zend_string *str, *encoding = NULL;
2319 zend_long from, len;
2320 size_t real_from, real_len;
2321 bool len_is_null = true;
2322
2324 Z_PARAM_STR(str)
2325 Z_PARAM_LONG(from)
2327 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2330
/* ZEND_LONG_MIN has no positive counterpart, so the `-from` / `-len` negations
 * further down could not represent it; both arguments are rejected up front */
2331 if (from == ZEND_LONG_MIN) {
2333 RETURN_THROWS();
2334 }
2335
2336 if (!len_is_null && len == ZEND_LONG_MIN) {
2338 RETURN_THROWS();
2339 }
2340
2341 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2342 if (!enc) {
2343 RETURN_THROWS();
2344 }
2345
/* The character count is only computed when a negative offset or length
 * has to be resolved against the end of the string */
2346 size_t mblen = 0;
2347 if (from < 0 || (!len_is_null && len < 0)) {
2348 mblen = mb_get_strlen(str, enc);
2349 }
2350
2351 /* if "from" position is negative, count start position from the end
2352 * of the string */
2353 if (from >= 0) {
2354 real_from = (size_t) from;
2355 } else if (-from < mblen) {
2356 real_from = mblen + from;
2357 } else {
/* Negative offset reaches before the start of the string: clamp to 0 */
2358 real_from = 0;
2359 }
2360
2361 /* if "length" position is negative, set it to the length
2362 * needed to stop that many chars from the end of the string */
2363 if (len_is_null) {
2364 real_len = MBFL_SUBSTR_UNTIL_END;
2365 } else if (len >= 0) {
2366 real_len = (size_t) len;
2367 } else if (real_from < mblen && -len < mblen - real_from) {
2368 real_len = (mblen - real_from) + len;
2369 } else {
/* Negative length removes everything that remains after `real_from` */
2370 real_len = 0;
2371 }
2372
2373 RETVAL_STR(mb_get_substr(str, real_from, real_len, enc));
2374}
2375/* }}} */
2376
2377/* {{{ Returns part of a string */
/* NOTE(review): the signature line is missing from this listing (presumably
 * PHP_FUNCTION(mb_strcut) -- TODO confirm), as are the declaration of `encoding`,
 * the argument-parsing open/close lines and the body of the "empty result" branch.
 *
 * Unlike the character-based substring code, `from` and `len` here are compared
 * against and added to `string.len`, i.e. they are byte quantities. The visible code
 * picks one of four strategies: the encoding's own `cut` callback, plain arithmetic for
 * fixed-width encodings, a walk over the encoding's `mblen_table`, or mbfl_strcut(). */
2379{
2381 char *string_val;
2382 zend_long from, len;
2383 bool len_is_null = true;
2384 mbfl_string string, result, *ret;
2385
2387 Z_PARAM_STRING(string_val, string.len)
2388 Z_PARAM_LONG(from)
2390 Z_PARAM_LONG_OR_NULL(len, len_is_null)
2393
2394 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 4);
2395 if (!enc) {
2396 RETURN_THROWS();
2397 }
2398
2399 string.val = (unsigned char*)string_val;
2400 string.encoding = enc;
2401
/* An omitted length means "up to the whole byte length of the input" */
2402 if (len_is_null) {
2403 len = string.len;
2404 }
2405
2406 /* if "from" position is negative, count start position from the end
2407 * of the string */
2408 if (from < 0) {
2409 from = string.len + from;
2410 if (from < 0) {
2411 from = 0;
2412 }
2413 }
2414
2415 /* if "length" position is negative, set it to the length
2416 * needed to stop that many chars from the end of the string */
2417 if (len < 0) {
2418 len = (string.len - from) + len;
2419 if (len < 0) {
2420 len = 0;
2421 }
2422 }
2423
/* NOTE(review): the statement inside this branch is missing from the listing */
2424 if (from > string.len || len == 0) {
2426 }
2427
/* Strategy 1: the encoding supplies its own cutting routine */
2428 if (enc->cut) {
2429 RETURN_STR(enc->cut(string.val, from, len, string.val + string.len));
2430 }
2431
/* Strategy 2: one of the fixed-width flags is set; the masked flag value is used
 * directly as the character size in bytes */
2432 unsigned int char_len = string.encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
2433 if (char_len) {
2434 /* Round `from` down to a multiple of `char_len`; works because `char_len` is a power of 2 */
2435 from &= -char_len;
2436 if (len > string.len - from) {
2437 len = string.len - from;
2438 }
/* The length is rounded down the same way, so no partial character is emitted */
2439 RETURN_STR(zend_string_init_fast((const char*)(string.val + from), len & -char_len));
2440 }
2441
/* Strategy 3: step character by character using the per-lead-byte length table */
2442 if (enc->mblen_table) {
2443 const unsigned char *mbtab = enc->mblen_table;
2444 const unsigned char *p, *q, *end;
2445 int m = 0;
2446 /* Search for start position */
2447 for (p = (const unsigned char*)string.val, q = p + from; p < q; p += (m = mbtab[*p]));
/* `m` holds the size of the last step; if that step overshot `q`, `q` lies inside
 * a character, so back up to that character's first byte */
2448 if (p > q) {
2449 p -= m;
2450 }
2451 const unsigned char *start = p;
2452 /* Search for end position */
2453 if (len >= string.len - (start - (const unsigned char*)string.val)) {
2454 end = (const unsigned char*)(string.val + string.len);
2455 } else {
2456 for (q = p + len; p < q; p += (m = mbtab[*p]));
2457 if (p > q) {
2458 p -= m;
2459 }
2460 end = p;
2461 }
2462 RETURN_STR(zend_string_init_fast((const char*)start, end - start));
2463 }
2464
/* Strategy 4: generic fallback */
2465 ret = mbfl_strcut(&string, &result, from, len);
2466 ZEND_ASSERT(ret != NULL);
2467 RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
2468 efree(ret->val);
2469}
2470/* }}} */
2471
2472/* Some East Asian characters, when printed at a terminal (or the like), require double
2473 * the usual amount of horizontal space. We call these "fullwidth" characters.
 *
 * Returns 2 when `c` lies inside one of the [begin, end] ranges of mbfl_eaw_table
 * and 1 otherwise.
 * NOTE(review): the line that opens the early-return guard directly below is missing
 * from this listing. */
2474static size_t character_width(uint32_t c)
2475{
2477 return 1;
2478 }
2479
2480 /* Do a binary search to see if we fall in any of the fullwidth ranges */
2481 unsigned int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
2482 while (lo < hi) {
2483 unsigned int probe = (lo + hi) / 2;
2484 if (c < mbfl_eaw_table[probe].begin) {
/* `c` is below this range: continue in the lower half (hi is exclusive) */
2485 hi = probe;
2486 } else if (c > mbfl_eaw_table[probe].end) {
/* `c` is above this range: continue in the upper half */
2487 lo = probe + 1;
2488 } else {
2489 return 2;
2490 }
2491 }
2492
2493 return 1;
2494}
2495
2496static size_t mb_get_strwidth(zend_string *string, const mbfl_encoding *enc)
2497{
2498 size_t width = 0;
2499 uint32_t wchar_buf[128];
2500 unsigned char *in = (unsigned char*)ZSTR_VAL(string);
2501 size_t in_len = ZSTR_LEN(string);
2502 unsigned int state = 0;
2503
2504 while (in_len) {
2505 size_t out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2506 ZEND_ASSERT(out_len <= 128);
2507
2508 while (out_len) {
2509 /* NOTE: 'bad input' marker will be counted as 1 unit of width
2510 * If text conversion is performed with an ordinary ASCII character as
2511 * the 'replacement character', this will give us the correct display width. */
2512 width += character_width(wchar_buf[--out_len]);
2513 }
2514 }
2515
2516 return width;
2517}
2518
2519/* Gets terminal width of a string
 * NOTE(review): the signature line (presumably PHP_FUNCTION(mb_strwidth) -- TODO confirm)
 * and the argument-parsing open/close lines are missing from this listing. */
2521{
2522 zend_string *string, *enc_name = NULL;
2523
2525 Z_PARAM_STR(string)
2527 Z_PARAM_STR_OR_NULL(enc_name)
2529
2530 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
2531 if (!enc) {
2532 RETURN_THROWS();
2533 }
2534
/* The width is the sum of per-codepoint widths computed by mb_get_strwidth() */
2535 RETVAL_LONG(mb_get_strwidth(string, enc));
2536}
2537
/* Cut `input` down to at most `width` display columns, starting `from` codepoints in.
 * - If everything after `from` fits, the (sub)string is returned without the marker.
 * - If `width` is not larger than the marker's own width, the marker alone is returned.
 * - Otherwise the result is as much of the input as fits in `width` minus the marker
 *   width, followed by the bytes of `marker`.
 * NOTE(review): `buf` is used below, but the lines declaring and initialising it are
 * missing from this listing. */
2538static zend_string* mb_trim_string(zend_string *input, zend_string *marker, const mbfl_encoding *enc, size_t from, size_t width)
2539{
2540 uint32_t wchar_buf[128];
2541 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
2542 size_t in_len = ZSTR_LEN(input);
2543 unsigned int state = 0;
2544 size_t remaining_width = width;
2545 size_t to_skip = from;
2546 size_t out_len = 0;
2547 bool first_call = true, input_err = false;
2549
/* First pass: measure only. Walk the decoded codepoints (skipping the first `from`)
 * until either the input ends or the width budget is exceeded */
2550 while (in_len) {
2551 out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2552 ZEND_ASSERT(out_len <= 128);
2553
2554 if (out_len <= to_skip) {
2555 to_skip -= out_len;
2556 } else {
2557 for (size_t i = to_skip; i < out_len; i++) {
2558 uint32_t w = wchar_buf[i];
2559 size_t current_w_width = character_width(w);
2560
2561 input_err |= (w == MBFL_BAD_INPUT);
2562
2563 if (remaining_width < current_w_width) {
2564 size_t marker_width = mb_get_strwidth(marker, enc);
2565
2566 /* The trim marker is larger than the desired string width */
2567 if (width <= marker_width) {
2568 return zend_string_copy(marker);
2569 }
2570
2571 /* We need to truncate string and append trim marker */
2572 width -= marker_width;
2573 /* 'width' is now the amount we want to take from 'input' */
2575
2576 if (first_call) {
2577 /* We can use the buffer of wchars which we have right now;
2578 * no need to convert again */
2579 goto dont_restart_conversion;
2580 } else {
2581 goto restart_conversion;
2582 }
2583 }
2584 remaining_width -= current_w_width;
2585 }
2586 to_skip = 0;
2587 }
2588 first_call = false;
2589 }
2590
2591 /* The input string fits in the requested width; we don't need to append the trim marker
2592 * However, if the string contains erroneous byte sequences, those should be converted
2593 * to error markers */
2594 if (!input_err) {
2595 if (from == 0) {
2596 /* This just increments the string's refcount; it doesn't really 'copy' it */
2597 return zend_string_copy(input);
2598 } else {
2599 return mb_get_substr(input, from, MBFL_SUBSTR_UNTIL_END, enc);
2600 }
2601 } else {
2602 /* We can't use `mb_get_substr`, because it uses the fastest method possible of
2603 * picking out a substring, which may not include converting erroneous byte
2604 * sequences to error markers */
2605 return mb_get_substr_slow((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), from, MBFL_SUBSTR_UNTIL_END, enc);
2606 }
2607
/* Everything below is reached only through the two gotos in the measuring loop */
2608 /* The input string is too wide; we need to build a new string which
2609 * includes some portion of the input string, with the trim marker
2610 * concatenated onto it */
2611restart_conversion:
2612 in = (unsigned char*)ZSTR_VAL(input);
2613 in_len = ZSTR_LEN(input);
2614 state = 0;
2615
/* Second pass: re-encode codepoints into `buf` until the reduced `width` runs out.
 * `from` is consumed as the skip counter here (the first pass used `to_skip`) */
2616 while (true) {
2617 out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
2618 ZEND_ASSERT(out_len <= 128);
2619
2620dont_restart_conversion:
2621 if (out_len <= from) {
2622 from -= out_len;
2623 } else {
2624 for (size_t i = from; i < out_len; i++) {
2625 size_t current_wchar_char_width = character_width(wchar_buf[i]);
2626 if (width < current_wchar_char_width) {
/* Codepoint `i` does not fit: emit [from, i) as the final chunk (last argument true) */
2627 enc->from_wchar(wchar_buf + from, i - from, &buf, true);
2628 goto append_trim_marker;
2629 }
2630 width -= current_wchar_char_width;
2631 }
/* The whole batch fitted; the first pass established that the input overflows,
 * so more input must remain */
2632 ZEND_ASSERT(in_len > 0);
2633 enc->from_wchar(wchar_buf + from, out_len - from, &buf, false);
2634 from = 0;
2635 }
2636 }
2637
2638append_trim_marker:
/* The marker bytes are copied verbatim, without any conversion */
2639 if (ZSTR_LEN(marker) > 0) {
2640 MB_CONVERT_BUF_ENSURE((&buf), buf.out, buf.limit, ZSTR_LEN(marker));
2641 buf.out = zend_mempcpy(buf.out, ZSTR_VAL(marker), ZSTR_LEN(marker));
2642 }
2643
2644 /* Even if `enc` is UTF-8, don't mark the output string as valid UTF-8, because
2645 * we have no guarantee that the trim marker string is valid UTF-8 */
2646 return mb_convert_buf_result_raw(&buf);
2647}
2648
2649/* Trim the string to terminal width; optionally, add a 'trim marker' if it was truncated
 * NOTE(review): the signature line (presumably PHP_FUNCTION(mb_strimwidth) -- TODO confirm),
 * the argument-parsing open/close lines, the parsing of `encoding` and the first line of
 * the deprecation call are missing from this listing. */
2651{
2652 zend_string *str, *trimmarker = zend_empty_string, *encoding = NULL;
2653 zend_long from, width;
2654
2656 Z_PARAM_STR(str)
2657 Z_PARAM_LONG(from)
2658 Z_PARAM_LONG(width)
2660 Z_PARAM_STR(trimmarker)
2663
2664 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 5);
2665 if (!enc) {
2666 RETURN_THROWS();
2667 }
2668
/* A negative `from` counts back from the end of the string; once resolved, the
 * offset must lie within [0, character count] */
2669 if (from != 0) {
2670 size_t str_len = mb_get_strlen(str, enc);
2671 if (from < 0) {
2672 from += str_len;
2673 }
2674 if (from < 0 || from > str_len) {
2675 zend_argument_value_error(2, "is out of range");
2676 RETURN_THROWS();
2677 }
2678 }
2679
/* Deprecated: a negative `width` is taken relative to the display width of the part
 * of the string that follows `from` (full width minus the width of the skipped prefix) */
2680 if (width < 0) {
2682 "passing a negative integer to argument #3 ($width) is deprecated");
2683 width += mb_get_strwidth(str, enc);
2684
2685 if (from > 0) {
2686 zend_string *trimmed = mb_get_substr(str, 0, from, enc);
2687 width -= mb_get_strwidth(trimmed, enc);
2688 zend_string_free(trimmed);
2689 }
2690
2691 if (width < 0) {
2692 zend_argument_value_error(3, "is out of range");
2693 RETURN_THROWS();
2694 }
2695 }
2696
2697 RETVAL_STR(mb_trim_string(str, trimmarker, enc, from, width));
2698}
2699
2700
2701/* See mbfl_no_encoding definition for list of unsupported encodings */
2702static inline bool php_mb_is_unsupported_no_encoding(enum mbfl_no_encoding no_enc)
2703{
2704 return ((no_enc >= mbfl_no_encoding_invalid && no_enc <= mbfl_no_encoding_qprint)
2705 || (no_enc >= mbfl_no_encoding_utf7 && no_enc <= mbfl_no_encoding_utf7imap)
2706 || (no_enc >= mbfl_no_encoding_jis && no_enc <= mbfl_no_encoding_2022jpms)
2707 || (no_enc >= mbfl_no_encoding_cp50220 && no_enc <= mbfl_no_encoding_cp50222));
2708}
2709
2710MBSTRING_API zend_string* php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
2711{
2712 unsigned int num_errors = 0;
2713 zend_string *result = mb_fast_convert((unsigned char*)input, length, from_encoding, to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &num_errors);
2714 MBSTRG(illegalchars) += num_errors;
2715 return result;
2716}
2717
2718MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2719{
2720 const mbfl_encoding *from_encoding;
2721
2722 /* pre-conversion encoding */
2723 ZEND_ASSERT(num_from_encodings >= 1);
2724 if (num_from_encodings == 1) {
2725 from_encoding = *from_encodings;
2726 } else {
2727 /* auto detect */
2728 from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
2729 if (!from_encoding) {
2730 php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
2731 return NULL;
2732 }
2733 }
2734
2735 return php_mb_convert_encoding_ex(input, length, to_encoding, from_encoding);
2736}
2737
/* Build a new array in which every string key and string value of `input` has been
 * converted to `to_encoding`; nested arrays are handled by the IS_ARRAY case.
 * Entries whose key or string value fails to convert, and entries of an unsupported
 * type, are left out of the result. Returns NULL for a NULL or recursive `input`.
 * NOTE(review): several lines are missing from this listing (the declaration of `key`,
 * the statements around the recursion guard, the first line of the nested call in the
 * IS_ARRAY case, and the end of the foreach loop). */
2738MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
2739{
2740 HashTable *output, *chash;
2741 zend_long idx;
2743 zval *entry, entry_tmp;
2744
2745 if (!input) {
2746 return NULL;
2747 }
2748
2749 if (GC_IS_RECURSIVE(input)) {
2751 php_error_docref(NULL, E_WARNING, "Cannot convert recursively referenced values");
2752 return NULL;
2753 }
2755 output = zend_new_array(zend_hash_num_elements(input));
2756 ZEND_HASH_FOREACH_KEY_VAL(input, idx, key, entry) {
2757 /* convert key */
2758 if (key) {
2759 zend_string *converted_key = php_mb_convert_encoding(ZSTR_VAL(key), ZSTR_LEN(key), to_encoding, from_encodings, num_from_encodings);
2760 if (!converted_key) {
2761 continue;
2762 }
/* From here on `key` is the converted copy, which every path below releases */
2763 key = converted_key;
2764 }
2765 /* convert value */
2766 ZEND_ASSERT(entry);
2767try_again:
2768 switch(Z_TYPE_P(entry)) {
2769 case IS_STRING: {
/* Despite its name, this local holds the converted *value* */
2770 zend_string *converted_key = php_mb_convert_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), to_encoding, from_encodings, num_from_encodings);
2771 if (!converted_key) {
2772 if (key) {
2773 zend_string_release(key);
2774 }
2775 continue;
2776 }
2777 ZVAL_STR(&entry_tmp, converted_key);
2778 break;
2779 }
2780 case IS_NULL:
2781 case IS_TRUE:
2782 case IS_FALSE:
2783 case IS_LONG:
2784 case IS_DOUBLE:
/* Scalars other than strings are copied through unchanged */
2785 ZVAL_COPY(&entry_tmp, entry);
2786 break;
2787 case IS_ARRAY:
2789 Z_ARRVAL_P(entry), to_encoding, from_encodings, num_from_encodings);
/* A NULL result for the nested array is stored as an empty array */
2790 if (chash) {
2791 ZVAL_ARR(&entry_tmp, chash);
2792 } else {
2793 ZVAL_EMPTY_ARRAY(&entry_tmp);
2794 }
2795 break;
2796 case IS_REFERENCE:
/* Look through the reference and dispatch again on the referenced value */
2797 entry = Z_REFVAL_P(entry);
2798 goto try_again;
2799 case IS_OBJECT:
2800 default:
2801 if (key) {
2802 zend_string_release(key);
2803 }
2804 php_error_docref(NULL, E_WARNING, "Object is not supported");
2805 continue;
2806 }
2807 if (key) {
2808 zend_hash_add(output, key, &entry_tmp);
2809 zend_string_release(key);
2810 } else {
2811 zend_hash_index_add(output, idx, &entry_tmp);
2812 }
2815
2816 return output;
2817}
2818/* }}} */
2819
2820static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2821{
2822 /* mbstring supports some 'text encodings' which aren't really text encodings
2823 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
2824 * These should never be returned by `mb_detect_encoding`. */
2825 unsigned int shift = 0;
2826 for (unsigned int i = 0; i < *size; i++) {
2827 const mbfl_encoding *encoding = elist[i];
2828 if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2829 shift++; /* Remove this encoding from the list */
2830 } else if (shift) {
2831 elist[i - shift] = encoding;
2832 }
2833 }
2834 *size -= shift;
2835}
2836
2837/* {{{ Returns converted string in desired encoding */
/* NOTE(review): the signature line is missing from this listing (presumably
 * PHP_FUNCTION(mb_convert_encoding) -- TODO confirm), as are the argument-parsing
 * open/close lines, the statement of the failed-string branch and the first line of
 * the recursive array call.
 *
 * Converts a string, or every string inside an array, to `to_encoding_name`. The
 * source encoding comes from the third argument (array or string list) or, when that
 * is absent, from the current internal encoding. */
2839{
2840 zend_string *to_encoding_name;
2841 zend_string *input_str, *from_encodings_str = NULL;
2842 HashTable *input_ht, *from_encodings_ht = NULL;
2843 const mbfl_encoding **from_encodings;
2844 size_t num_from_encodings;
2845 bool free_from_encodings = false;
2846
2848 Z_PARAM_ARRAY_HT_OR_STR(input_ht, input_str)
2849 Z_PARAM_STR(to_encoding_name)
2851 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(from_encodings_ht, from_encodings_str)
2853
2854 const mbfl_encoding *to_encoding = php_mb_get_encoding(to_encoding_name, 2);
2855 if (!to_encoding) {
2856 RETURN_THROWS();
2857 }
2858
/* `free_from_encodings` records whether `from_encodings` was produced by one of the
 * parsers below (and is efree'd at the end) or points at module state */
2859 if (from_encodings_ht) {
2860 if (php_mb_parse_encoding_array(from_encodings_ht, &from_encodings, &num_from_encodings, 3) == FAILURE) {
2861 RETURN_THROWS();
2862 }
2863 free_from_encodings = true;
2864 } else if (from_encodings_str) {
2865 if (php_mb_parse_encoding_list(ZSTR_VAL(from_encodings_str), ZSTR_LEN(from_encodings_str),
2866 &from_encodings, &num_from_encodings,
2867 /* persistent */ 0, /* arg_num */ 3) == FAILURE) {
2868 RETURN_THROWS();
2869 }
2870 free_from_encodings = true;
2871 } else {
2872 from_encodings = &MBSTRG(current_internal_encoding);
2873 num_from_encodings = 1;
2874 }
2875
/* With several candidates the source encoding will be guessed, so entries that
 * remove_non_encodings_from_elist() filters out are dropped first */
2876 if (num_from_encodings > 1) {
2877 remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2878 }
2879
2880 if (!num_from_encodings) {
2881 efree(ZEND_VOIDP(from_encodings));
2882 zend_argument_value_error(3, "must specify at least one encoding");
2883 RETURN_THROWS();
2884 }
2885
2886 if (input_str) {
2887 zend_string *ret = php_mb_convert_encoding(ZSTR_VAL(input_str), ZSTR_LEN(input_str), to_encoding, from_encodings, num_from_encodings);
2888 if (ret != NULL) {
2889 RETVAL_STR(ret);
2890 } else {
2892 }
2893 } else {
2894 HashTable *tmp;
2896 input_ht, to_encoding, from_encodings, num_from_encodings);
2897 RETVAL_ARR(tmp);
2898 }
2899
2900 if (free_from_encodings) {
2901 efree(ZEND_VOIDP(from_encodings));
2902 }
2903}
2904/* }}} */
2905
/* Case-conversion helper: callers in this file pass a case mode, a byte buffer with its
 * length and the buffer's encoding, and hand the returned zend_string straight back to PHP.
 * NOTE(review): the single body line of this function is missing from this listing. */
2906static zend_string *mbstring_convert_case(php_case_mode case_mode, const char *str, size_t str_len, const mbfl_encoding *enc)
2907{
2909}
2910
/* NOTE(review): the signature line is missing from this listing (presumably
 * PHP_FUNCTION(mb_convert_case) -- TODO confirm), as are the argument-parsing
 * open/close lines.
 * Validates the encoding and the case mode, then returns the case-converted string. */
2912{
2913 zend_string *str, *from_encoding = NULL;
2914 zend_long case_mode = 0;
2915
2917 Z_PARAM_STR(str)
2918 Z_PARAM_LONG(case_mode)
2920 Z_PARAM_STR_OR_NULL(from_encoding)
2922
2923 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 3);
2924 if (!enc) {
2925 RETURN_THROWS();
2926 }
2927
/* Accepted modes are 0 .. PHP_UNICODE_CASE_MODE_MAX - 1 */
2928 if (case_mode < 0 || case_mode >= PHP_UNICODE_CASE_MODE_MAX) {
2929 zend_argument_value_error(2, "must be one of the MB_CASE_* constants");
2930 RETURN_THROWS();
2931 }
2932
2933 RETURN_STR(mbstring_convert_case(case_mode, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2934}
2935
/* NOTE(review): the signature line is missing from this listing (presumably
 * PHP_FUNCTION(mb_strtoupper) -- TODO confirm), as are the argument-parsing
 * open/close lines.
 * Returns `str` converted with the PHP_UNICODE_CASE_UPPER mode. */
2937{
2938 zend_string *str, *from_encoding = NULL;
2939
2941 Z_PARAM_STR(str)
2943 Z_PARAM_STR_OR_NULL(from_encoding)
2945
2946 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2947 if (!enc) {
2948 RETURN_THROWS();
2949 }
2950
2951 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_UPPER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2952}
2953
/* NOTE(review): the signature line is missing from this listing (presumably
 * PHP_FUNCTION(mb_strtolower) -- TODO confirm), as are the argument-parsing
 * open/close lines.
 * Returns `str` converted with the PHP_UNICODE_CASE_LOWER mode. */
2955{
2956 zend_string *str, *from_encoding = NULL;
2957
2959 Z_PARAM_STR(str)
2961 Z_PARAM_STR_OR_NULL(from_encoding)
2963
2964 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2965 if (!enc) {
2966 RETURN_THROWS();
2967 }
2968
2969 RETURN_STR(mbstring_convert_case(PHP_UNICODE_CASE_LOWER, ZSTR_VAL(str), ZSTR_LEN(str), enc));
2970}
2971
/* Shared implementation for changing the case of the first character only: the first
 * character of `str` is converted with `mode`; if that changes nothing, `str` itself is
 * returned.
 * NOTE(review): several lines are missing from this listing (argument-parsing open/close,
 * the release of `head`, the statement that joins head and tail, and the final return). */
2972static void php_mb_ulcfirst(INTERNAL_FUNCTION_PARAMETERS, php_case_mode mode)
2973{
2974 zend_string *str, *from_encoding = NULL;
2975
2977 Z_PARAM_STR(str)
2979 Z_PARAM_STR_OR_NULL(from_encoding)
2981
2982 const mbfl_encoding *enc = php_mb_get_encoding(from_encoding, 2);
2983 if (!enc) {
2984 RETURN_THROWS();
2985 }
2986
/* `first` is the original first character, `head` its case-converted form */
2987 zend_string *first = mb_get_substr(str, 0, 1, enc);
2988 zend_string *head = mbstring_convert_case(mode, ZSTR_VAL(first), ZSTR_LEN(first), enc);
2989
/* Conversion left the first character unchanged: return the input with an added reference */
2990 if (zend_string_equals(first, head)) {
2991 zend_string_release_ex(first, false);
2993 RETURN_STR(zend_string_copy(str));
2994 }
2995
/* `second` is everything after the first character */
2996 zend_string *second = mb_get_substr(str, 1, MBFL_SUBSTR_UNTIL_END, enc);
2998
2999 zend_string_release_ex(first, false);
3001 zend_string_release_ex(second, false);
3002
3004}
3005
3010
3015
/* Selects which end(s) of a string the trim helpers strip. trim_each_wchar() tests the
 * value with `mode & MB_LTRIM` / `mode & MB_RTRIM`, i.e. treats the members as bit flags.
 * NOTE(review): the enumerator lines are missing from this listing. */
3016typedef enum {
3020} mb_trim_mode;
3021
3022static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
3023{
3024 if (ht) {
3025 return zend_hash_index_exists(ht, w);
3026 } else {
3027 for (size_t i = 0; i < default_chars_length; i++) {
3028 if (w == default_chars[i]) {
3029 return true;
3030 }
3031 }
3032 return false;
3033 }
3034}
3035
3036static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
3037{
3038 unsigned char *in = (unsigned char*)ZSTR_VAL(str);
3039 uint32_t wchar_buf[128];
3040 size_t in_len = ZSTR_LEN(str);
3041 size_t out_len = 0;
3042 unsigned int state = 0;
3043 size_t left = 0;
3044 size_t right = 0;
3045 size_t total_len = 0;
3046
3047 while (in_len) {
3048 out_len = enc->to_wchar(&in, &in_len, wchar_buf, 128, &state);
3049 ZEND_ASSERT(out_len <= 128);
3050 total_len += out_len;
3051
3052 for (size_t i = 0; i < out_len; i++) {
3053 uint32_t w = wchar_buf[i];
3054 if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
3055 if (mode & MB_LTRIM) {
3056 left += 1;
3057 }
3058 if (mode & MB_RTRIM) {
3059 right += 1;
3060 }
3061 } else {
3062 mode &= ~MB_LTRIM;
3063 if (mode & MB_RTRIM) {
3064 right = 0;
3065 }
3066 }
3067 }
3068 }
3069
3070 if (left == 0 && right == 0) {
3071 return zend_string_copy(str);
3072 }
3073 return mb_get_substr(str, left, total_len - (right + left), enc);
3074}
3075
3076static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, const mbfl_encoding *enc)
3077{
3078 const uint32_t trim_default_chars[] = {
3079 0x20, 0x0C, 0x0A, 0x0D, 0x09, 0x0B, 0x00, 0xA0, 0x1680,
3080 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007,
3081 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
3082 0x85, 0x180E
3083 };
3084 size_t trim_default_chars_length = sizeof(trim_default_chars) / sizeof(uint32_t);
3085
3086 HashTable what_ht;
3087 zval val;
3088 ZVAL_TRUE(&val);
3089
3090 zend_hash_init(&what_ht, trim_default_chars_length, NULL, NULL, false);
3091
3092 for (size_t i = 0; i < trim_default_chars_length; i++) {
3093 zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
3094 }
3095 zend_string* retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
3096 zend_hash_destroy(&what_ht);
3097
3098 return retval;
3099}
3100
/* Trim `str` using the characters of `what` (decoded with `enc`) as the trim set.
 * A first batch of at most 4 codepoints is passed to trim_each_wchar() as a plain array;
 * otherwise all codepoints are collected as keys of a temporary hash table.
 * An empty `what` returns `str` with an added reference. */
3101static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_trim_mode mode, const mbfl_encoding *enc)
3102{
3103 unsigned char *what_in = (unsigned char*)ZSTR_VAL(what);
3104 uint32_t what_wchar_buf[128];
3105 size_t what_out_len = 0;
3106 unsigned int state = 0;
3107 size_t what_len = ZSTR_LEN(what);
3108 HashTable what_ht;
3109 zval val;
3110 bool hash_initialized = false;
3111
3112 while (what_len) {
3113 what_out_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, 128, &state);
3114 ZEND_ASSERT(what_out_len <= 128);
3115
/* NOTE(review): this early exit returns without checking whether input remains; it
 * presumably relies on to_wchar() only producing a short batch once `what` is
 * exhausted -- confirm against the to_wchar contract */
3116 if (what_out_len <= 4 && !hash_initialized) {
3117 return trim_each_wchar(str, NULL, what_wchar_buf, what_out_len, mode, enc);
3118 } else {
3119 if (!hash_initialized) {
3120 hash_initialized = true;
3121 ZVAL_TRUE(&val);
/* NOTE(review): `what_len` was passed by address to to_wchar() above, so by now it is
 * presumably the count of bytes not yet decoded rather than the full length -- confirm
 * that it is only meant as an initial size for the table */
3122 zend_hash_init(&what_ht, what_len, NULL, NULL, false);
3123 }
3124 for (size_t i = 0; i < what_out_len; i++) {
3125 zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
3126 }
3127 }
3128 }
3129
3130 if (UNEXPECTED(!hash_initialized)) {
3131 /* This is only possible if what is empty */
3132 return zend_string_copy(str);
3133 }
3134
3135 zend_string *retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
3136 zend_hash_destroy(&what_ht);
3137
3138 return retval;
3139}
3140
/* Shared implementation behind the trim entry points; `mode` selects the side(s).
 * When a `what` string is supplied it defines the trim set, otherwise the default set is used.
 * NOTE(review): the declaration of `encoding`, the argument-parsing open/close lines and
 * the parsing of the optional arguments are missing from this listing. */
3141static void php_do_mb_trim(INTERNAL_FUNCTION_PARAMETERS, mb_trim_mode mode)
3142{
3143 zend_string *str;
3144 zend_string *what = NULL;
3146
3148 Z_PARAM_STR(str)
3153
3154 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
3155 if (!enc) {
3156 RETURN_THROWS();
3157 }
3158
3159 if (what) {
3160 RETURN_STR(mb_trim_what_chars(str, what, mode, enc));
3161 } else {
3162 RETURN_STR(mb_trim_default_chars(str, mode, enc));
3163 }
3164}
3165
3170
3175
3180
3181static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
3182{
3183 const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
3184 memcpy(ZEND_VOIDP(new_elist), elist, size * sizeof(mbfl_encoding*));
3185 return new_elist;
3186}
3187
3188static unsigned int estimate_demerits(uint32_t w)
3189{
3190 /* Receive wchars decoded from input string using candidate encoding.
3191 * Give the candidate many 'demerits' for each 'rare' codepoint found,
3192 * a smaller number for each ASCII punctuation character, and 1 for
3193 * all other codepoints.
3194 *
3195 * The 'common' codepoints should cover the vast majority of
3196 * codepoints we are likely to see in practice, while only covering
3197 * a small minority of the entire Unicode encoding space. Why?
3198 * Well, if the test string happens to be valid in an incorrect
3199 * candidate encoding, the bogus codepoints which it decodes to will
3200 * be more or less random. By treating the majority of codepoints as
3201 * 'rare', we ensure that in almost all such cases, the bogus
3202 * codepoints will include plenty of 'rares', thus giving the
3203 * incorrect candidate encoding lots of demerits. See
3204 * common_codepoints.txt for the actual list used.
3205 *
3206 * So, why give extra demerits for ASCII punctuation characters? It's
3207 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
3208 * which deliberately only use bytes in the ASCII range. When
3209 * misinterpreted as ASCII/UTF-8, strings in these encodings will
3210 * have an unusually high number of ASCII punctuation characters.
3211 * So giving extra demerits for such characters will improve
3212 * detection accuracy for UTF-7 and similar encodings.
3213 *
3214 * Finally, why 1 demerit for all other characters? That penalizes
3215 * long strings, meaning we will tend to choose a candidate encoding
3216 * in which the test string decodes to a smaller number of
3217 * codepoints. That prevents single-byte encodings in which almost
3218 * every possible input byte decodes to a 'common' codepoint from
3219 * being favored too much. */
3220 if (w > 0xFFFF) {
3221 return 40;
3222 } else if (w >= 0x21 && w <= 0x2F) {
3223 return 6;
3224 } else if ((rare_codepoint_bitvec[w >> 5] >> (w & 0x1F)) & 1) {
3225 return 30;
3226 } else {
3227 return 1;
3228 }
3229 return 0;
3230}
3231
/* NOTE(review): these are the visible members of `struct candidate`; the line opening
 * the struct and the members `enc` and `multiplier` (both used by the functions below)
 * are missing from this listing. */
/* Unread portion of the string currently being examined (set by start_string) */
3234 const unsigned char *in;
3235 size_t in_len;
3236 uint64_t demerits; /* Wide bit size to prevent overflow */
/* Decoder state handed to enc->to_wchar() between batches */
3237 unsigned int state;
3239};
3240
3241static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
3242{
3243 size_t j = 0;
3244
3245 for (size_t i = 0; i < length; i++) {
3246 const mbfl_encoding *enc = encodings[i];
3247
3248 array[j].enc = enc;
3249 array[j].state = 0;
3250 array[j].demerits = 0;
3251
3252 /* If any candidate encodings have specialized validation functions, use them
3253 * to eliminate as many candidates as possible */
3254 if (enc->check != NULL) {
3255 for (size_t k = 0; k < n; k++) {
3256 if (!enc->check((unsigned char*)in[k], in_len[k])) {
3257 if (strict) {
3258 goto skip_to_next;
3259 } else {
3260 array[j].demerits += 500;
3261 }
3262 }
3263 }
3264 }
3265
3266 /* This multiplier can optionally be used to make candidate encodings listed
3267 * first more likely to be chosen. It is a weight factor which multiplies
3268 * the number of demerits counted for each candidate. */
3269 array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
3270 j++;
3271skip_to_next: ;
3272 }
3273
3274 return j;
3275}
3276
3277static void start_string(struct candidate *array, size_t length, const unsigned char *in, size_t in_len)
3278{
3279 for (size_t i = 0; i < length; i++) {
3280 const mbfl_encoding *enc = array[i].enc;
3281
3282 array[i].in = in;
3283 array[i].in_len = in_len;
3284
3285 /* Skip byte order mark for UTF-8, UTF-16BE, or UTF-16LE */
3286 if (enc == &mbfl_encoding_utf8) {
3287 if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
3288 array[i].in_len -= 3;
3289 array[i].in += 3;
3290 }
3291 } else if (enc == &mbfl_encoding_utf16be) {
3292 if (in_len >= 2 && in[0] == 0xFE && in[1] == 0xFF) {
3293 array[i].in_len -= 2;
3294 array[i].in += 2;
3295 }
3296 } else if (enc == &mbfl_encoding_utf16le) {
3297 if (in_len >= 2 && in[0] == 0xFF && in[1] == 0xFE) {
3298 array[i].in_len -= 2;
3299 array[i].in += 2;
3300 }
3301 }
3302 }
3303}
3304
/* Decode the pending input of every candidate in batches of up to 128 codepoints and
 * add up demerits. In strict mode a candidate that yields MBFL_BAD_INPUT is removed
 * from `array`; otherwise each error marker costs 1000 demerits. Afterwards each
 * candidate's demerits are scaled by its multiplier, saturating at UINT64_MAX.
 * Returns the number of candidates still in `array`. */
3305static size_t count_demerits(struct candidate *array, size_t length, bool strict)
3306{
3307 uint32_t wchar_buf[128];
3308 unsigned int finished = 0; /* For how many candidate encodings have we processed all the input? */
3309
3310 for (size_t i = 0; i < length; i++) {
3311 if (array[i].in_len == 0) {
3312 finished++;
3313 }
3314 }
3315
/* In non-strict mode nothing is ever removed, so with a single candidate there is
 * nothing to compare and decoding is skipped entirely */
3316 while ((strict || length > 1) && finished < length) {
3317 /* Iterate in reverse order to avoid moving candidates that can be eliminated. */
3318 for (size_t i = length - 1; i != (size_t)-1; i--) {
3319 /* Do we still have more input to process for this candidate encoding? */
3320 if (array[i].in_len) {
3321 const mbfl_encoding *enc = array[i].enc;
3322 size_t out_len = enc->to_wchar((unsigned char**)&array[i].in, &array[i].in_len, wchar_buf, 128, &array[i].state);
3323 ZEND_ASSERT(out_len <= 128);
3324 /* Check this batch of decoded codepoints; are there any error markers?
3325 * Also sum up the number of demerits */
3326 while (out_len) {
3327 uint32_t w = wchar_buf[--out_len];
3328 if (w == MBFL_BAD_INPUT) {
3329 if (strict) {
3330 /* This candidate encoding is not valid, eliminate it from consideration */
3331 length--;
3332 if (i < length) {
3333 /* The eliminated candidate was not the last one in the list; shift the later ones down to close the gap */
3334 memmove(&array[i], &array[i+1], (length - i) * sizeof(struct candidate));
3335 }
3336 goto try_next_encoding;
3337 } else {
3338 array[i].demerits += 1000;
3339 }
3340 } else {
3341 array[i].demerits += estimate_demerits(w);
3342 }
3343 }
3344 if (array[i].in_len == 0) {
3345 finished++;
3346 }
3347 }
3348try_next_encoding:;
3349 }
3350 }
3351
/* Apply the per-candidate weight; the product is computed as a double and clamped
 * before being stored back into the 64-bit counter */
3352 for (size_t i = 0; i < length; i++) {
3353 double demerits = array[i].demerits * (double) array[i].multiplier;
3354 array[i].demerits = demerits < (double) UINT64_MAX ? (uint64_t) demerits : UINT64_MAX;
3355 }
3356
3357 return length;
3358}
3359
3360MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3361{
3362 if (elist_size == 0) {
3363 return NULL;
3364 }
3365 if (elist_size == 1) {
3366 if (strict) {
3367 while (n--) {
3368 if (!php_mb_check_encoding((const char*)strings[n], str_lengths[n], *elist)) {
3369 return NULL;
3370 }
3371 }
3372 }
3373 return *elist;
3374 }
3375 if (n == 1 && *str_lengths == 0) {
3376 return *elist;
3377 }
3378
3379 /* Allocate on stack; when we return, this array is automatically freed */
3380 struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3381 elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
3382
3383 while (n--) {
3384 start_string(array, elist_size, strings[n], str_lengths[n]);
3385 elist_size = count_demerits(array, elist_size, strict);
3386 if (elist_size == 0) {
3387 /* All candidates were eliminated */
3388 return NULL;
3389 }
3390 }
3391
3392 /* See which remaining candidate encoding has the least demerits */
3393 unsigned int best = 0;
3394 for (unsigned int i = 1; i < elist_size; i++) {
3395 if (array[i].demerits < array[best].demerits) {
3396 best = i;
3397 }
3398 }
3399 return array[best].enc;
3400}
3401
3402/* When doing 'strict' detection, any string which is invalid in the candidate encoding
3403 * is rejected. With non-strict detection, we just continue, but apply demerits for
3404 * each invalid byte sequence */
3405static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
3406{
3407 return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
3408}
3409
3410/* {{{ Returns the detected encoding of the given string (as a string) */
3412{
3413 zend_string *str, *encoding_str = NULL;
3414 HashTable *encoding_ht = NULL;
3415 bool strict = false;
3416 const mbfl_encoding *ret, **elist;
3417 size_t size;
3418
3420 Z_PARAM_STR(str)
3422 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(encoding_ht, encoding_str)
3423 Z_PARAM_BOOL(strict)
3425
3426 /* Should we pay attention to the order of the provided candidate encodings and prefer
3427 * the earlier ones (if more than one candidate encoding matches)?
3428 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3429 * in, then don't treat the order as significant */
3430 bool order_significant = true;
3431
3432 /* make encoding list */
3433 if (encoding_ht) {
3434 if (encoding_ht == MBSTRG(all_encodings_list)) {
3435 order_significant = false;
3436 }
3437 if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
3438 RETURN_THROWS();
3439 }
3440 } else if (encoding_str) {
3441 if (FAILURE == php_mb_parse_encoding_list(ZSTR_VAL(encoding_str), ZSTR_LEN(encoding_str), &elist, &size, /* persistent */ 0, /* arg_num */ 2)) {
3442 RETURN_THROWS();
3443 }
3444 } else {
3447 }
3448
3449 if (size == 0) {
3450 efree(ZEND_VOIDP(elist));
3451 zend_argument_value_error(2, "must specify at least one encoding");
3452 RETURN_THROWS();
3453 }
3454
3455 remove_non_encodings_from_elist(elist, &size);
3456 if (size == 0) {
3457 efree(ZEND_VOIDP(elist));
3459 }
3460
3461 if (ZEND_NUM_ARGS() < 3) {
3462 strict = MBSTRG(strict_detection);
3463 }
3464
3465 if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
3467 } else {
3468 ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
3469 }
3470
3471 efree(ZEND_VOIDP(elist));
3472
3473 if (ret == NULL) {
3475 }
3476
3477 RETVAL_STRING((char *)ret->name);
3478}
3479/* }}} */
3480
3481/* {{{ Returns an array of all supported entity encodings */
3483{
3485
3486 if (MBSTRG(all_encodings_list) == NULL) {
3487 /* Initialize shared array of supported encoding names
3488 * This is done so that we can check if `mb_list_encodings()` is being
3489 * passed to other mbstring functions using a cheap pointer equality check */
3490 HashTable *array = emalloc(sizeof(HashTable));
3491 zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
3492 for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
3493 zval tmp;
3494 ZVAL_STRING(&tmp, (*encodings)->name);
3495 zend_hash_next_index_insert(array, &tmp);
3496 }
3497 MBSTRG(all_encodings_list) = array;
3498 }
3499
3502}
3503/* }}} */
3504
3505/* {{{ Returns an array of the aliases of a given encoding name */
3507{
3508 const mbfl_encoding *encoding;
3509 zend_string *encoding_name = NULL;
3510
3512 Z_PARAM_STR(encoding_name)
3514
3515 encoding = php_mb_get_encoding(encoding_name, 1);
3516 if (!encoding) {
3517 RETURN_THROWS();
3518 }
3519
3521 if (encoding->aliases != NULL) {
3522 for (const char **alias = encoding->aliases; *alias; ++alias) {
3523 add_next_index_string(return_value, (char *)*alias);
3524 }
3525 }
3526}
3527/* }}} */
3528
3529static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
3530{
3531 /* Each wchar may potentially expand to 2 when we perform kana conversion...
3532 * if we are converting zenkaku kana to hankaku kana
3533 * Make the buffer for converted kana big enough that we never need to
3534 * perform bounds checks */
3535 uint32_t wchar_buf[64], converted_buf[64 * 2];
3536 unsigned int buf_offset = 0;
3537 unsigned int state = 0;
3538 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
3539 size_t in_len = ZSTR_LEN(input);
3540
3543
3544 while (in_len) {
3545 uint32_t *converted = converted_buf;
3546 /* If one codepoint has been left in wchar_buf[0] to be reprocessed from the
3547 * previous iteration, don't overwrite it */
3548 size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + buf_offset, 64 - buf_offset, &state);
3549 out_len += buf_offset;
3550 ZEND_ASSERT(out_len <= 64);
3551
3552 if (!out_len) {
3553 continue;
3554 }
3555
3556 for (size_t i = 0; i < out_len-1; i++) {
3557 uint32_t second = 0;
3558 bool consumed = false;
3559 *converted++ = mb_convert_kana_codepoint(wchar_buf[i], wchar_buf[i+1], &consumed, &second, mode);
3560 if (second) {
3561 *converted++ = second;
3562 }
3563 if (consumed) {
3564 i++;
3565 if (i == out_len-1) {
3566 /* We consumed two codepoints at the very end of the wchar buffer
3567 * So there is nothing remaining to reprocess on the next iteration */
3568 buf_offset = 0;
3569 goto emit_converted_kana;
3570 }
3571 }
3572 }
3573
3574 if (!in_len) {
3575 /* This is the last iteration, so we need to process the final codepoint now */
3576 uint32_t second = 0;
3577 *converted++ = mb_convert_kana_codepoint(wchar_buf[out_len-1], 0, NULL, &second, mode);
3578 if (second) {
3579 *converted++ = second;
3580 }
3581 } else {
3582 /* Reprocess the last codepoint on the next iteration */
3583 wchar_buf[0] = wchar_buf[out_len-1];
3584 buf_offset = 1;
3585 }
3586
3587emit_converted_kana:
3588 encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
3589 }
3590
3591 return mb_convert_buf_result(&buf, encoding);
3592}
3593
3595 'A', 'R', 'N', 'S', 'K', 'H', 'M', 'C',
3596 'a', 'r', 'n', 's', 'k', 'h', 'm', 'c',
3597 'V'
3598};
3599
3600/* Conversion between full-width characters and half-width characters (Japanese) */
3602{
3603 unsigned int opt;
3604 char *optstr = NULL;
3605 size_t optstr_len;
3606 zend_string *encname = NULL, *str;
3607
3609 Z_PARAM_STR(str)
3611 Z_PARAM_STRING(optstr, optstr_len)
3612 Z_PARAM_STR_OR_NULL(encname)
3614
3615 if (optstr != NULL) {
3616 char *p = optstr, *e = p + optstr_len;
3617 opt = 0;
3618next_option:
3619 while (p < e) {
3620 /* Walk through option string and convert to bit vector
3621 * See translit_kana_jisx0201_jisx0208.h for the values used */
3622 char c = *p++;
3623 if (c == 'A') {
3625 } else if (c == 'a') {
3627 } else {
3628 for (size_t i = 0; i < sizeof(mb_convert_kana_flags) / sizeof(char); i++) {
3629 if (c == mb_convert_kana_flags[i]) {
3630 opt |= (1 << i);
3631 goto next_option;
3632 }
3633 }
3634
3635 zend_argument_value_error(2, "contains invalid flag: '%c'", c);
3636 RETURN_THROWS();
3637 }
3638 }
3639
3640 /* Check for illegal combinations of options */
3641 if (((opt & 0xFF00) >> 8) & opt) {
3642 /* It doesn't make sense to convert the same type of characters from halfwidth to
3643 * fullwidth and then back to halfwidth again. Neither does it make sense to convert
3644 * FW hiragana to FW katakana and then back again. */
3645 int badflag = ((opt & 0xFF00) >> 8) & opt, i;
3646 for (i = 0; (badflag & 1) == 0; badflag >>= 1, i++);
3647 char flag1 = mb_convert_kana_flags[i], flag2 = mb_convert_kana_flags[i+8];
3648 if ((flag1 == 'R' || flag1 == 'N') && (opt & MBFL_HAN2ZEN_ALL))
3649 flag1 = 'A';
3650 if ((flag2 == 'r' || flag2 == 'n') && (opt & MBFL_ZEN2HAN_ALL))
3651 flag2 = 'a';
3652 zend_argument_value_error(2, "must not combine '%c' and '%c' flags", flag1, flag2);
3653 RETURN_THROWS();
3654 }
3655
3656 if ((opt & MBFL_HAN2ZEN_HIRAGANA) && (opt & MBFL_HAN2ZEN_KATAKANA)) {
3657 /* We can either convert all HW kana to FW hiragana, or to FW katakana, but not both */
3658 zend_argument_value_error(2, "must not combine 'H' and 'K' flags");
3659 RETURN_THROWS();
3660 }
3661
3662 /* We can either convert all FW kana to HW hiragana, or all FW kana to HW katakana,
3663 * or all FW hiragana to FW katakana, or all FW katakana to FW hiragana, but not
3664 * more than one of these */
3665 if (opt & MBFL_ZEN2HAN_HIRAGANA) {
3666 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3667 zend_argument_value_error(2, "must not combine 'h' and 'C' flags");
3668 RETURN_THROWS();
3669 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3670 zend_argument_value_error(2, "must not combine 'h' and 'c' flags");
3671 RETURN_THROWS();
3672 }
3673 } else if (opt & MBFL_ZEN2HAN_KATAKANA) {
3674 if (opt & MBFL_ZENKAKU_HIRA2KATA) {
3675 zend_argument_value_error(2, "must not combine 'k' and 'C' flags");
3676 RETURN_THROWS();
3677 } else if (opt & MBFL_ZENKAKU_KATA2HIRA) {
3678 zend_argument_value_error(2, "must not combine 'k' and 'c' flags");
3679 RETURN_THROWS();
3680 }
3681 }
3682 } else {
3684 }
3685
3686 const mbfl_encoding *enc = php_mb_get_encoding(encname, 3);
3687 if (!enc) {
3688 RETURN_THROWS();
3689 }
3690
3691 RETVAL_STR(jp_kana_convert(str, enc, opt));
3692}
3693
3694static unsigned int mb_recursive_count_strings(zval *var)
3695{
3696 unsigned int count = 0;
3697 ZVAL_DEREF(var);
3698
3699 if (Z_TYPE_P(var) == IS_STRING) {
3700 count++;
3701 } else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3702 if (Z_REFCOUNTED_P(var)) {
3703 if (Z_IS_RECURSIVE_P(var)) {
3704 return count;
3705 }
3707 }
3708
3709 HashTable *ht = HASH_OF(var);
3710 if (ht != NULL) {
3711 zval *entry;
3713 count += mb_recursive_count_strings(entry);
3715 }
3716
3717 if (Z_REFCOUNTED_P(var)) {
3719 }
3720 }
3721
3722 return count;
3723}
3724
3725static bool mb_recursive_find_strings(zval *var, const unsigned char **val_list, size_t *len_list, unsigned int *count)
3726{
3727 ZVAL_DEREF(var);
3728
3729 if (Z_TYPE_P(var) == IS_STRING) {
3730 val_list[*count] = (const unsigned char*)Z_STRVAL_P(var);
3731 len_list[*count] = Z_STRLEN_P(var);
3732 (*count)++;
3733 } else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3734 if (Z_REFCOUNTED_P(var)) {
3735 if (Z_IS_RECURSIVE_P(var)) {
3736 return true;
3737 }
3739 }
3740
3741 HashTable *ht = HASH_OF(var);
3742 if (ht != NULL) {
3743 zval *entry;
3745 if (mb_recursive_find_strings(entry, val_list, len_list, count)) {
3746 if (Z_REFCOUNTED_P(var)) {
3748 return true;
3749 }
3750 }
3752 }
3753
3754 if (Z_REFCOUNTED_P(var)) {
3756 }
3757 }
3758
3759 return false;
3760}
3761
3762static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
3763{
3764 zval *entry, *orig_var;
3765
3766 orig_var = var;
3767 ZVAL_DEREF(var);
3768
3769 if (Z_TYPE_P(var) == IS_STRING) {
3770 zend_string *ret = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
3771 zval_ptr_dtor(orig_var);
3772 ZVAL_STR(orig_var, ret);
3773 } else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
3774 if (Z_TYPE_P(var) == IS_ARRAY) {
3775 SEPARATE_ARRAY(var);
3776 }
3777 if (Z_REFCOUNTED_P(var)) {
3778 if (Z_IS_RECURSIVE_P(var)) {
3779 return true;
3780 }
3782 }
3783
3784 HashTable *ht = HASH_OF(var);
3785 if (ht != NULL) {
3786 ZEND_HASH_FOREACH_VAL(ht, entry) {
3787 /* Can be a typed property declaration, in which case we need to remove the reference from the source list.
3788 * Just using ZEND_TRY_ASSIGN_STRINGL is not sufficient because that would not unwrap the reference
3789 * and change values through references (see bug #26639). */
3790 if (Z_TYPE_P(entry) == IS_INDIRECT) {
3792
3793 entry = Z_INDIRECT_P(entry);
3794 if (Z_ISREF_P(entry) && Z_TYPE_P(Z_REFVAL_P(entry)) == IS_STRING) {
3795 zend_property_info *info = zend_get_typed_property_info_for_slot(Z_OBJ_P(var), entry);
3796 if (info) {
3797 ZEND_REF_DEL_TYPE_SOURCE(Z_REF_P(entry), info);
3798 }
3799 }
3800 }
3801
3802 if (mb_recursive_convert_variable(entry, from_encoding, to_encoding)) {
3803 if (Z_REFCOUNTED_P(var)) {
3805 }
3806 return true;
3807 }
3809 }
3810
3811 if (Z_REFCOUNTED_P(var)) {
3813 }
3814 }
3815
3816 return false;
3817}
3818
3820{
3821 zval *args;
3822 zend_string *to_enc_str;
3823 zend_string *from_enc_str;
3824 HashTable *from_enc_ht;
3825 const mbfl_encoding *from_encoding, *to_encoding;
3826 uint32_t argc;
3827 size_t elistsz;
3828 const mbfl_encoding **elist;
3829
3831 Z_PARAM_STR(to_enc_str)
3832 Z_PARAM_ARRAY_HT_OR_STR(from_enc_ht, from_enc_str)
3833 Z_PARAM_VARIADIC('+', args, argc)
3835
3836 /* new encoding */
3837 to_encoding = php_mb_get_encoding(to_enc_str, 1);
3838 if (!to_encoding) {
3839 RETURN_THROWS();
3840 }
3841
3842 from_encoding = MBSTRG(current_internal_encoding);
3843
3844 bool order_significant = true;
3845
3846 /* pre-conversion encoding */
3847 if (from_enc_ht) {
3848 if (from_enc_ht == MBSTRG(all_encodings_list)) {
3849 /* If entire list of supported encodings returned by `mb_list_encodings` is passed
3850 * in, then don't treat the order of the list as significant */
3851 order_significant = false;
3852 }
3853 if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
3854 RETURN_THROWS();
3855 }
3856 } else {
3857 if (php_mb_parse_encoding_list(ZSTR_VAL(from_enc_str), ZSTR_LEN(from_enc_str), &elist, &elistsz, /* persistent */ 0, /* arg_num */ 2) == FAILURE) {
3858 RETURN_THROWS();
3859 }
3860 }
3861
3862 if (elistsz == 0) {
3863 efree(ZEND_VOIDP(elist));
3864 zend_argument_value_error(2, "must specify at least one encoding");
3865 RETURN_THROWS();
3866 }
3867
3868 if (elistsz == 1) {
3869 from_encoding = *elist;
3870 } else {
3871 /* auto detect */
3872 unsigned int num = 0;
3873 for (size_t n = 0; n < argc; n++) {
3874 zval *zv = &args[n];
3875 num += mb_recursive_count_strings(zv);
3876 }
3877 const unsigned char **val_list = (const unsigned char**)ecalloc(num, sizeof(char *));
3878 size_t *len_list = (size_t*)ecalloc(num, sizeof(size_t));
3879 unsigned int i = 0;
3880 for (size_t n = 0; n < argc; n++) {
3881 zval *zv = &args[n];
3882 if (mb_recursive_find_strings(zv, val_list, len_list, &i)) {
3883 efree(ZEND_VOIDP(elist));
3884 efree(ZEND_VOIDP(val_list));
3885 efree(len_list);
3886 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3888 }
3889 }
3890 from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
3891 efree(ZEND_VOIDP(val_list));
3892 efree(len_list);
3893 if (!from_encoding) {
3894 php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
3895 efree(ZEND_VOIDP(elist));
3897 }
3898
3899 }
3900
3901 efree(ZEND_VOIDP(elist));
3902
3903 /* convert */
3904 for (size_t n = 0; n < argc; n++) {
3905 zval *zv = &args[n];
3906 ZVAL_DEREF(zv);
3907 if (mb_recursive_convert_variable(zv, from_encoding, to_encoding)) {
3908 php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
3910 }
3911 }
3912
3913 RETURN_STRING(from_encoding->name);
3914}
3915
3916/* HTML numeric entities */
3917
3918/* Convert PHP array to data structure required by mbfl_html_numeric_entity */
3919static uint32_t *make_conversion_map(HashTable *target_hash, size_t *conversion_map_size)
3920{
3921 zval *hash_entry;
3922
3923 size_t n_elems = *conversion_map_size = zend_hash_num_elements(target_hash);
3924 if (n_elems % 4 != 0) {
3925 zend_argument_value_error(2, "must have a multiple of 4 elements");
3926 return NULL;
3927 }
3928
3929 uint32_t *convmap = (uint32_t*)safe_emalloc(n_elems, sizeof(uint32_t), 0);
3930 uint32_t *mapelm = convmap;
3931
3932 ZEND_HASH_FOREACH_VAL(target_hash, hash_entry) {
3933 bool failed = true;
3934 zend_long tmp = zval_try_get_long(hash_entry, &failed);
3935 if (failed) {
3936 efree(convmap);
3937 zend_argument_value_error(2, "must only be composed of values of type int");
3938 return NULL;
3939 }
3940 *mapelm++ = tmp;
3942
3943 return convmap;
3944}
3945
/* Look up codepoint `w` in a conversion map made of 4-element records
 * { lo_code, hi_code, offset, mask }; `conversion_map_size` is the total number of
 * uint32_t elements, not the number of records.
 *
 * On the first record whose inclusive range [lo_code, hi_code] contains `w`, stores
 * (w + offset) & mask in `*retval` and returns true. Returns false, leaving `*retval`
 * untouched, if no record matches. */
static bool html_numeric_entity_convert(uint32_t w, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t idx = 0; idx < conversion_map_size; idx += 4) {
		const uint32_t *range = &convmap[idx];

		if (w < range[0] || w > range[1]) {
			/* Outside this range; try the next record */
			continue;
		}

		/* The addition deliberately wraps modulo 2^32 */
		*retval = (w + range[2]) & range[3];
		return true;
	}

	/* None of the ranges matched */
	return false;
}
3967
3968static zend_string* html_numeric_entity_encode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size, bool hex)
3969{
3970 /* Each wchar which we get from decoding the input string may become up to
3971 * 13 wchars when we convert it to an HTML entity */
3972 uint32_t wchar_buf[32], converted_buf[32 * 13];
3973 unsigned char entity[16]; /* For converting wchars to hex/decimal string */
3974
3975 unsigned int state = 0;
3976 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
3977 size_t in_len = ZSTR_LEN(input);
3978
3981
3982 while (in_len) {
3983 /* Convert input string to wchars, up to 32 at a time */
3984 size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 32, &state);
3985 ZEND_ASSERT(out_len <= 32);
3986 uint32_t *converted = converted_buf;
3987
3988 /* Run through wchars and see if any of them fall into the ranges
3989 * which we want to convert to HTML entities */
3990 for (size_t i = 0; i < out_len; i++) {
3991 uint32_t w = wchar_buf[i];
3992
3993 if (html_numeric_entity_convert(w, convmap, conversion_map_size, &w)) {
3994 *converted++ = '&';
3995 *converted++ = '#';
3996 if (hex) {
3997 *converted++ = 'x';
3998 }
3999
4000 /* Convert wchar to decimal/hex string */
4001 if (w == 0) {
4002 *converted++ = '0';
4003 } else {
4004 unsigned char *p = entity + sizeof(entity);
4005 if (hex) {
4006 while (w > 0) {
4007 *(--p) = "0123456789ABCDEF"[w & 0xF];
4008 w >>= 4;
4009 }
4010 } else {
4011 while (w > 0) {
4012 *(--p) = "0123456789"[w % 10];
4013 w /= 10;
4014 }
4015 }
4016 while (p < entity + sizeof(entity)) {
4017 *converted++ = *p++;
4018 }
4019 }
4020
4021 *converted++ = ';';
4022 } else {
4023 *converted++ = w;
4024 }
4025 }
4026
4027 ZEND_ASSERT(converted <= converted_buf + sizeof(converted_buf)/sizeof(*converted_buf));
4028 encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
4029 }
4030
4031 return mb_convert_buf_result(&buf, encoding);
4032}
4033
4034/* {{{ Converts specified characters to HTML numeric entities */
4036{
4037 zend_string *encoding = NULL, *str;
4038 size_t conversion_map_size;
4039 HashTable *target_hash;
4040 bool is_hex = false;
4041
4043 Z_PARAM_STR(str)
4044 Z_PARAM_ARRAY_HT(target_hash)
4047 Z_PARAM_BOOL(is_hex)
4049
4050 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4051 if (!enc) {
4052 RETURN_THROWS();
4053 }
4054
4055 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4056 if (convmap == NULL) {
4057 RETURN_THROWS();
4058 }
4059
4060 RETVAL_STR(html_numeric_entity_encode(str, enc, convmap, conversion_map_size, is_hex));
4061 efree(convmap);
4062}
4063/* }}} */
4064
/* Inverse lookup for a numeric entity: given the `number` written in an entity, find
 * the codepoint it stands for. The conversion map uses 4-element records
 * { lo_code, hi_code, offset, mask } and `conversion_map_size` counts uint32_t
 * elements. The mask element is not consulted here.
 *
 * For each record, `number - offset` is computed (wrapping modulo 2^32); the first
 * result which falls inside the record's inclusive range [lo_code, hi_code] is stored
 * in `*retval` and true is returned. Returns false, leaving `*retval` untouched, if no
 * record matches. */
static bool html_numeric_entity_deconvert(uint32_t number, uint32_t *convmap, size_t conversion_map_size, uint32_t *retval)
{
	for (size_t idx = 0; idx < conversion_map_size; idx += 4) {
		uint32_t cp = number - convmap[idx + 2];

		if (cp < convmap[idx] || cp > convmap[idx + 1]) {
			continue;
		}

		*retval = cp;
		return true;
	}

	return false;
}
4082
4083#define DEC_ENTITY_MINLEN 3 /* For "&#" and 1 decimal digit */
4084#define HEX_ENTITY_MINLEN 4 /* For "&#x" and 1 hexadecimal digit */
4085#define DEC_ENTITY_MAXLEN 12 /* For "&#" and 10 decimal digits */
4086#define HEX_ENTITY_MAXLEN 11 /* For "&#x" and 8 hexadecimal digits */
4087
4088static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_encoding *encoding, uint32_t *convmap, size_t conversion_map_size)
4089{
4090 uint32_t wchar_buf[128], converted_buf[128];
4091
4092 unsigned int state = 0;
4093 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
4094 size_t in_len = ZSTR_LEN(input);
4095
4098
4099 /* Decode input string from bytes to wchars one 128-wchar buffer at a time, then deconvert HTML entities,
4100 * copying the deconverted wchars to a second buffer, then convert back to original encoding from the
4101 * 2nd 'converted' buffer.
4102 *
4103 * Tricky part: an HTML entity might be truncated at the end of the wchar buffer; the remaining
4104 * part could come in the next buffer of wchars. To deal with this problem, when we find what looks
4105 * like an HTML entity, we scan to see if it terminates before the end of the wchar buffer or not.
4106 * If not, we copy it to the beginning of the wchar buffer, and tell the input conversion routine
4107 * to store the next batch of wchars after it.
4108 *
4109 * Optimization: Scan for &, and if we don't find it anywhere, don't even bother copying the
4110 * wchars from the 1st buffer to the 2nd one.
4111 *
4112 * 'converted_buf' is big enough that the deconverted wchars will *always* fit in it, so we don't
4113 * have to do bounds checks when writing wchars into it.
4114 */
4115
4116 unsigned int wchar_buf_offset = 0;
4117
4118 while (in_len) {
4119 /* Leave space for sentinel at the end of the buffer */
4120 size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf + wchar_buf_offset, 127 - wchar_buf_offset, &state);
4121 out_len += wchar_buf_offset;
4122 ZEND_ASSERT(out_len <= 127);
4123 wchar_buf[out_len] = '&'; /* Sentinel, to avoid bounds checks */
4124
4125 uint32_t *p, *converted;
4126
4127 /* Scan for & first; however, if `wchar_buf_offset` > 0, then definitely & will
4128 * be there (in `wchar_buf[0]`), so don't bother in that case */
4129 if (wchar_buf_offset == 0) {
4130 p = wchar_buf;
4131 while (*p != '&')
4132 p++;
4133 if (p == wchar_buf + out_len) {
4134 /* No HTML entities in this buffer */
4135 encoding->from_wchar(wchar_buf, out_len, &buf, !in_len);
4136 continue;
4137 }
4138
4139 /* Copy over the prefix with no & which we already scanned */
4140 memcpy(converted_buf, wchar_buf, (p - wchar_buf) * 4);
4141 converted = converted_buf + (p - wchar_buf);
4142 } else {
4143 p = wchar_buf;
4144 converted = converted_buf;
4145 }
4146
4147found_ampersand:
4148 ZEND_ASSERT(*p == '&');
4149 uint32_t *p2 = p;
4150
4151 /* These tests can't overrun end of buffer, because we have a '&' sentinel there */
4152 if (*++p2 == '#') {
4153 if (*++p2 == 'x') {
4154 /* Possible hex entity */
4155 uint32_t w = *++p2;
4156 while ((w >= '0' && w <= '9') || (w >= 'A' && w <= 'F') || (w >= 'a' && w <= 'f'))
4157 w = *++p2;
4158 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= HEX_ENTITY_MAXLEN) {
4159 /* We hit the end of the buffer while reading digits, and
4160 * more wchars are still coming in the next buffer
4161 * Reprocess this identity on next iteration */
4162 memmove(wchar_buf, p, (p2 - p) * 4);
4163 wchar_buf_offset = p2 - p;
4164 goto process_converted_wchars;
4165 } else if ((p2 - p) < HEX_ENTITY_MINLEN || (p2 - p) > HEX_ENTITY_MAXLEN) {
4166 /* Invalid entity (too long or "&#x" only) */
4167 memcpy(converted, p, (p2 - p) * 4);
4168 converted += p2 - p;
4169 } else {
4170 /* Valid hexadecimal entity */
4171 uint32_t value = 0, *p3 = p + 3;
4172 while (p3 < p2) {
4173 w = *p3++;
4174 if (w <= '9') {
4175 value = (value * 16) + (w - '0');
4176 } else if (w >= 'a') {
4177 value = (value * 16) + 10 + (w - 'a');
4178 } else {
4179 value = (value * 16) + 10 + (w - 'A');
4180 }
4181 }
4182 if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4183 converted++;
4184 if (*p2 == ';')
4185 p2++;
4186 } else {
4187 memcpy(converted, p, (p2 - p) * 4);
4188 converted += p2 - p;
4189 }
4190 }
4191 } else {
4192 /* Possible decimal entity */
4193 uint32_t w = *p2;
4194 while (w >= '0' && w <= '9')
4195 w = *++p2;
4196 if ((p2 == wchar_buf + out_len) && in_len && (p2 - p) <= DEC_ENTITY_MAXLEN) {
4197 /* The number of digits was legal (no more than 10 decimal digits)
4198 * Reprocess this identity on next iteration of main loop */
4199 memmove(wchar_buf, p, (p2 - p) * 4);
4200 wchar_buf_offset = p2 - p;
4201 goto process_converted_wchars;
4202 } else if ((p2 - p) < DEC_ENTITY_MINLEN || (p2 - p) > DEC_ENTITY_MAXLEN) {
4203 /* Invalid entity (too long or "&#" only) */
4204 memcpy(converted, p, (p2 - p) * 4);
4205 converted += p2 - p;
4206 } else {
4207 /* Valid decimal entity */
4208 uint32_t value = 0, *p3 = p + 2;
4209 while (p3 < p2) {
4210 /* If unsigned integer overflow would occur in the below
4211 * multiplication by 10, this entity is no good
4212 * 0x19999999 is 1/10th of 0xFFFFFFFF */
4213 if (value > 0x19999999) {
4214 memcpy(converted, p, (p2 - p) * 4);
4215 converted += p2 - p;
4216 goto decimal_entity_too_big;
4217 }
4218 value = (value * 10) + (*p3++ - '0');
4219 }
4220 if (html_numeric_entity_deconvert(value, convmap, conversion_map_size, converted)) {
4221 converted++;
4222 if (*p2 == ';')
4223 p2++;
4224 } else {
4225 memcpy(converted, p, (p2 - p) * 4);
4226 converted += p2 - p;
4227 }
4228 }
4229 }
4230 } else if ((p2 == wchar_buf + out_len) && in_len) {
4231 /* Corner case: & at end of buffer */
4232 wchar_buf[0] = '&';
4233 wchar_buf_offset = 1;
4234 goto process_converted_wchars;
4235 } else {
4236 *converted++ = '&';
4237 }
4238decimal_entity_too_big:
4239
4240 /* Starting to scan a new section of the wchar buffer
4241 * 'p2' is pointing at the next wchar which needs to be processed */
4242 p = p2;
4243 while (*p2 != '&')
4244 p2++;
4245
4246 if (p2 > p) {
4247 memcpy(converted, p, (p2 - p) * 4);
4248 converted += p2 - p;
4249 p = p2;
4250 }
4251
4252 if (p < wchar_buf + out_len)
4253 goto found_ampersand;
4254
4255 /* We do not have any wchars remaining at the end of this buffer which
4256 * we need to reprocess on the next call */
4257 wchar_buf_offset = 0;
4258process_converted_wchars:
4259 ZEND_ASSERT(converted <= converted_buf + 128);
4260 encoding->from_wchar(converted_buf, converted - converted_buf, &buf, !in_len);
4261 }
4262
4263 return mb_convert_buf_result(&buf, encoding);
4264}
4265
4266/* {{{ Converts HTML numeric entities to character code */
4268{
4269 zend_string *encoding = NULL, *str;
4270 size_t conversion_map_size;
4271 HashTable *target_hash;
4272
4274 Z_PARAM_STR(str)
4275 Z_PARAM_ARRAY_HT(target_hash)
4279
4280 const mbfl_encoding *enc = php_mb_get_encoding(encoding, 3);
4281 if (!enc) {
4282 RETURN_THROWS();
4283 }
4284
4285 uint32_t *convmap = make_conversion_map(target_hash, &conversion_map_size);
4286 if (convmap == NULL) {
4287 RETURN_THROWS();
4288 }
4289
4290 RETVAL_STR(html_numeric_entity_decode(str, enc, convmap, conversion_map_size));
4291 efree(convmap);
4292}
4293/* }}} */
4294
4295/* {{{ Sends an email message with MIME scheme */
4296#define CRLF "\r\n"
4297
4298static int _php_mbstr_parse_mail_headers(HashTable *ht, const char *str, size_t str_len)
4299{
4300 const char *ps;
4301 size_t icnt;
4302 int state = 0;
4303 int crlf_state = -1;
4304 char *token = NULL;
4305 size_t token_pos = 0;
4306 zend_string *fld_name, *fld_val;
4307
4308 ps = str;
4309 icnt = str_len;
4310 fld_name = fld_val = NULL;
4311
4312 /*
4313 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4314 * ^ ^^^^^^^^^^^^^^^^^^^^^ ^^^ ^^^^^^^^^^^^^^^^^ ^^^^
4315 * state 0 1 2 3
4316 *
4317 * C o n t e n t - T y p e : t e x t / h t m l \r\n
4318 * ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^
4319 * crlf_state -1 0 1 -1
4320 *
4321 */
4322
4323 while (icnt > 0) {
4324 switch (*ps) {
4325 case ':':
4326 if (crlf_state == 1) {
4327 token_pos++;
4328 }
4329
4330 if (state == 0 || state == 1) {
4331 if(token && token_pos > 0) {
4332 fld_name = zend_string_init(token, token_pos, 0);
4333 }
4334 state = 2;
4335 } else {
4336 token_pos++;
4337 }
4338
4339 crlf_state = 0;
4340 break;
4341
4342 case '\n':
4343 if (crlf_state == -1) {
4344 goto out;
4345 }
4346 crlf_state = -1;
4347 break;
4348
4349 case '\r':
4350 if (crlf_state == 1) {
4351 token_pos++;
4352 } else {
4353 crlf_state = 1;
4354 }
4355 break;
4356
4357 case ' ': case '\t':
4358 if (crlf_state == -1) {
4359 if (state == 3) {
4360 /* continuing from the previous line */
4361 state = 4;
4362 } else {
4363 /* simply skipping this new line */
4364 state = 5;
4365 }
4366 } else {
4367 if (crlf_state == 1) {
4368 token_pos++;
4369 }
4370 if (state == 1 || state == 3) {
4371 token_pos++;
4372 }
4373 }
4374 crlf_state = 0;
4375 break;
4376
4377 default:
4378 switch (state) {
4379 case 0:
4380 token = (char*)ps;
4381 token_pos = 0;
4382 state = 1;
4383 break;
4384
4385 case 2:
4386 if (crlf_state != -1) {
4387 token = (char*)ps;
4388 token_pos = 0;
4389
4390 state = 3;
4391 break;
4392 }
4394
4395 case 3:
4396 if (crlf_state == -1) {
4397 if(token && token_pos > 0) {
4398 fld_val = zend_string_init(token, token_pos, 0);
4399 }
4400
4401 if (fld_name != NULL && fld_val != NULL) {
4402 zval val;
4403 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4404 ZVAL_STR(&val, fld_val);
4405
4406 zend_hash_update(ht, fld_name, &val);
4407
4408 zend_string_release_ex(fld_name, 0);
4409 }
4410
4411 fld_name = fld_val = NULL;
4412 token = (char*)ps;
4413 token_pos = 0;
4414
4415 state = 1;
4416 }
4417 break;
4418
4419 case 4:
4420 token_pos++;
4421 state = 3;
4422 break;
4423 }
4424
4425 if (crlf_state == 1) {
4426 token_pos++;
4427 }
4428
4429 token_pos++;
4430
4431 crlf_state = 0;
4432 break;
4433 }
4434 ps++, icnt--;
4435 }
4436out:
4437 if (state == 2) {
4438 token = "";
4439 token_pos = 0;
4440
4441 state = 3;
4442 }
4443 if (state == 3) {
4444 if(token && token_pos > 0) {
4445 fld_val = zend_string_init(token, token_pos, 0);
4446 }
4447 if (fld_name != NULL && fld_val != NULL) {
4448 zval val;
4449 zend_str_tolower(ZSTR_VAL(fld_name), ZSTR_LEN(fld_name));
4450 ZVAL_STR(&val, fld_val);
4451 zend_hash_update(ht, fld_name, &val);
4452
4453 zend_string_release_ex(fld_name, 0);
4454 }
4455 }
4456 return state;
4457}
4458
4460{
4461 char *to;
4462 size_t to_len;
4463 char *message;
4464 size_t message_len;
4465 zend_string *subject;
4466 zend_string *extra_cmd = NULL;
4467 HashTable *headers_ht = NULL;
4468 zend_string *str_headers = NULL;
4469 size_t i;
4470 char *to_r = NULL;
4471 bool suppress_content_type = false;
4472 bool suppress_content_transfer_encoding = false;
4473
4474 char *p;
4475 enum mbfl_no_encoding;
4476 const mbfl_encoding *tran_cs, /* transfer text charset */
4477 *head_enc, /* header transfer encoding */
4478 *body_enc; /* body transfer encoding */
4479 const mbfl_language *lang;
4480 HashTable ht_headers;
4481 zval *s;
4482
4483 /* character-set, transfer-encoding */
4484 tran_cs = &mbfl_encoding_utf8;
4485 head_enc = &mbfl_encoding_base64;
4486 body_enc = &mbfl_encoding_base64;
4488 if (lang != NULL) {
4489 tran_cs = mbfl_no2encoding(lang->mail_charset);
4490 head_enc = mbfl_no2encoding(lang->mail_header_encoding);
4491 body_enc = mbfl_no2encoding(lang->mail_body_encoding);
4492 }
4493
4495 Z_PARAM_PATH(to, to_len)
4496 Z_PARAM_PATH_STR(subject)
4497 Z_PARAM_PATH(message, message_len)
4499 Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
4500 Z_PARAM_PATH_STR_OR_NULL(extra_cmd)
4502
4503 if (str_headers) {
4504 if (strlen(ZSTR_VAL(str_headers)) != ZSTR_LEN(str_headers)) {
4505 zend_argument_value_error(4, "must not contain any null bytes");
4506 RETURN_THROWS();
4507 }
4508 str_headers = php_trim(str_headers, NULL, 0, 2);
4509 } else if (headers_ht) {
4510 str_headers = php_mail_build_headers(headers_ht);
4511 if (EG(exception)) {
4512 RETURN_THROWS();
4513 }
4514 }
4515
4516 zend_hash_init(&ht_headers, 0, NULL, ZVAL_PTR_DTOR, 0);
4517
4518 if (str_headers != NULL) {
4519 _php_mbstr_parse_mail_headers(&ht_headers, ZSTR_VAL(str_headers), ZSTR_LEN(str_headers));
4520 }
4521
4522 if ((s = zend_hash_str_find(&ht_headers, "content-type", sizeof("content-type") - 1))) {
4523 char *tmp;
4524 char *param_name;
4525 char *charset = NULL;
4526
4528 p = strchr(Z_STRVAL_P(s), ';');
4529
4530 if (p != NULL) {
4531 /* skipping the padded spaces */
4532 do {
4533 ++p;
4534 } while (*p == ' ' || *p == '\t');
4535
4536 if (*p != '\0') {
4537 if ((param_name = php_strtok_r(p, "= ", &tmp)) != NULL) {
4538 if (strcasecmp(param_name, "charset") == 0) {
4539 const mbfl_encoding *_tran_cs = tran_cs;
4540
4541 charset = php_strtok_r(NULL, "= \"", &tmp);
4542 if (charset != NULL) {
4543 _tran_cs = mbfl_name2encoding(charset);
4544 }
4545
4546 if (!_tran_cs) {
4547 php_error_docref(NULL, E_WARNING, "Unsupported charset \"%s\" - will be regarded as ascii", charset);
4548 _tran_cs = &mbfl_encoding_ascii;
4549 }
4550 tran_cs = _tran_cs;
4551 }
4552 }
4553 }
4554 }
4555 suppress_content_type = true;
4556 }
4557
4558 if ((s = zend_hash_str_find(&ht_headers, "content-transfer-encoding", sizeof("content-transfer-encoding") - 1))) {
4559 const mbfl_encoding *_body_enc;
4560
4562 _body_enc = mbfl_name2encoding(Z_STRVAL_P(s));
4563 switch (_body_enc ? _body_enc->no_encoding : mbfl_no_encoding_invalid) {
4567 body_enc = _body_enc;
4568 break;
4569
4570 default:
4571 php_error_docref(NULL, E_WARNING, "Unsupported transfer encoding \"%s\" - will be regarded as 8bit", Z_STRVAL_P(s));
4572 body_enc = &mbfl_encoding_8bit;
4573 break;
4574 }
4575 suppress_content_transfer_encoding = true;
4576 }
4577
4578 /* To: */
4579 if (to_len > 0) {
4580 to_r = estrndup(to, to_len);
4581 for (; to_len; to_len--) {
4582 if (!isspace((unsigned char) to_r[to_len - 1])) {
4583 break;
4584 }
4585 to_r[to_len - 1] = '\0';
4586 }
4587 for (i = 0; to_r[i]; i++) {
4588 if (iscntrl((unsigned char) to_r[i])) {
4589 /* According to RFC 822, section 3.1.1 long headers may be separated into
4590 * parts using CRLF followed at least one linear-white-space character ('\t' or ' ').
4591 * To prevent these separators from being replaced with a space, we skip over them. */
4592 if (to_r[i] == '\r' && to_r[i + 1] == '\n' && (to_r[i + 2] == ' ' || to_r[i + 2] == '\t')) {
4593 i += 2;
4594 while (to_r[i + 1] == ' ' || to_r[i + 1] == '\t') {
4595 i++;
4596 }
4597 continue;
4598 }
4599
4600 to_r[i] = ' ';
4601 }
4602 }
4603 } else {
4604 to_r = to;
4605 }
4606
4607 /* Subject: */
4609 if (enc == &mbfl_encoding_pass) {
4610 enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4611 }
4612 const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
4613 size_t line_sep_len = strlen(line_sep);
4614
4615 subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
4616
4617 /* message body */
4619 if (msg_enc == &mbfl_encoding_pass) {
4620 msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
4621 }
4622
4623 unsigned int num_errors = 0;
4624 zend_string *tmpstr = mb_fast_convert((unsigned char*)message, message_len, msg_enc, tran_cs, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4625 zend_string *conv = mb_fast_convert((unsigned char*)ZSTR_VAL(tmpstr), ZSTR_LEN(tmpstr), &mbfl_encoding_8bit, body_enc, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR, &num_errors);
4626 zend_string_free(tmpstr);
4627 message = ZSTR_VAL(conv);
4628
4629 /* other headers */
4630#define PHP_MBSTR_MAIL_MIME_HEADER1 "MIME-Version: 1.0"
4631#define PHP_MBSTR_MAIL_MIME_HEADER2 "Content-Type: text/plain"
4632#define PHP_MBSTR_MAIL_MIME_HEADER3 "; charset="
4633#define PHP_MBSTR_MAIL_MIME_HEADER4 "Content-Transfer-Encoding: "
4634
4635 smart_str str = {0};
4636 bool empty = true;
4637
4638 if (str_headers != NULL && ZSTR_LEN(str_headers) > 0) {
4639 /* Strip trailing CRLF from `str_headers`; we will add CRLF back if necessary */
4640 size_t len = ZSTR_LEN(str_headers);
4641 if (ZSTR_VAL(str_headers)[len-1] == '\n') {
4642 len--;
4643 }
4644 if (ZSTR_VAL(str_headers)[len-1] == '\r') {
4645 len--;
4646 }
4647 smart_str_appendl(&str, ZSTR_VAL(str_headers), len);
4648 empty = false;
4649 zend_string_release_ex(str_headers, 0);
4650 }
4651
4652 if (!zend_hash_str_exists(&ht_headers, "mime-version", sizeof("mime-version") - 1)) {
4653 if (!empty) {
4654 smart_str_appendl(&str, line_sep, line_sep_len);
4655 }
4656 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER1, sizeof(PHP_MBSTR_MAIL_MIME_HEADER1) - 1);
4657 empty = false;
4658 }
4659
4660 if (!suppress_content_type) {
4661 if (!empty) {
4662 smart_str_appendl(&str, line_sep, line_sep_len);
4663 }
4664 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER2, sizeof(PHP_MBSTR_MAIL_MIME_HEADER2) - 1);
4665
4666 p = (char *)mbfl_encoding_preferred_mime_name(tran_cs);
4667 if (p != NULL) {
4668 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER3, sizeof(PHP_MBSTR_MAIL_MIME_HEADER3) - 1);
4669 smart_str_appends(&str, p);
4670 }
4671 empty = false;
4672 }
4673
4674 if (!suppress_content_transfer_encoding) {
4675 if (!empty) {
4676 smart_str_appendl(&str, line_sep, line_sep_len);
4677 }
4678 smart_str_appendl(&str, PHP_MBSTR_MAIL_MIME_HEADER4, sizeof(PHP_MBSTR_MAIL_MIME_HEADER4) - 1);
4679 p = (char *)mbfl_encoding_preferred_mime_name(body_enc);
4680 if (p == NULL) {
4681 p = "7bit";
4682 }
4683 smart_str_appends(&str, p);
4684 }
4685
4686 str_headers = smart_str_extract(&str);
4687
4688 zend_string *force_extra_parameters = zend_ini_str_ex("mail.force_extra_parameters", strlen("mail.force_extra_parameters"), false, NULL);
4689 if (force_extra_parameters) {
4690 extra_cmd = php_escape_shell_cmd(force_extra_parameters);
4691 } else if (extra_cmd) {
4692 extra_cmd = php_escape_shell_cmd(extra_cmd);
4693 }
4694
4695 RETVAL_BOOL(php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
4696
4697 if (extra_cmd) {
4698 zend_string_release_ex(extra_cmd, 0);
4699 }
4700 if (to_r != to) {
4701 efree(to_r);
4702 }
4703 zend_string_release(subject);
4704 zend_string_free(conv);
4705 zend_hash_destroy(&ht_headers);
4706 if (str_headers) {
4707 zend_string_release_ex(str_headers, 0);
4708 }
4709}
4710
4711#undef CRLF
4712#undef MAIL_ASCIIZ_CHECK_MBSTRING
4713#undef PHP_MBSTR_MAIL_MIME_HEADER1
4714#undef PHP_MBSTR_MAIL_MIME_HEADER2
4715#undef PHP_MBSTR_MAIL_MIME_HEADER3
4716#undef PHP_MBSTR_MAIL_MIME_HEADER4
4717/* }}} */
4718
4719/* {{{ Returns the current settings of mbstring */
4721{
4723 size_t n;
4724 char *name;
4725 zval row;
4726 const mbfl_encoding **entry;
4728
4729 ZEND_ASSERT(lang);
4730
4735
4736 if (!type || zend_string_equals_literal_ci(type, "all")) {
4739 add_assoc_string(return_value, "internal_encoding", (char *)MBSTRG(current_internal_encoding)->name);
4740 }
4742 add_assoc_string(return_value, "http_input", (char *)MBSTRG(http_input_identify)->name);
4743 }
4745 add_assoc_string(return_value, "http_output", (char *)MBSTRG(current_http_output_encoding)->name);
4746 }
4747
4748 add_assoc_str(return_value, "http_output_conv_mimetypes",
4749 zend_ini_str("mbstring.http_output_conv_mimetypes", sizeof("mbstring.http_output_conv_mimetypes") - 1, 0)
4750 );
4751
4752 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4753 add_assoc_string(return_value, "mail_charset", name);
4754
4756 add_assoc_string(return_value, "mail_header_encoding", name);
4757
4759 add_assoc_string(return_value, "mail_body_encoding", name);
4760
4761 add_assoc_long(return_value, "illegal_chars", MBSTRG(illegalchars));
4762
4764 add_assoc_string(return_value, "encoding_translation", "On");
4765 } else {
4766 add_assoc_string(return_value, "encoding_translation", "Off");
4767 }
4768
4770 add_assoc_string(return_value, "language", name);
4771
4772 // TODO Seems to always have one entry at least?
4775 if (n > 0) {
4776 size_t i;
4777 array_init(&row);
4778 for (i = 0; i < n; i++) {
4779 add_next_index_string(&row, (*entry)->name);
4780 entry++;
4781 }
4782 add_assoc_zval(return_value, "detect_order", &row);
4783 }
4785 add_assoc_string(return_value, "substitute_character", "none");
4787 add_assoc_string(return_value, "substitute_character", "long");
4789 add_assoc_string(return_value, "substitute_character", "entity");
4790 } else {
4791 add_assoc_long(return_value, "substitute_character", MBSTRG(current_filter_illegal_substchar));
4792 }
4793 if (MBSTRG(strict_detection)) {
4794 add_assoc_string(return_value, "strict_detection", "On");
4795 } else {
4796 add_assoc_string(return_value, "strict_detection", "Off");
4797 }
4798 } else if (zend_string_equals_literal_ci(type, "internal_encoding")) {
4801 } else if (zend_string_equals_literal_ci(type, "http_input")) {
4804 }
4805 RETURN_NULL();
4806 } else if (zend_string_equals_literal_ci(type, "http_output")) {
4809 } else if (zend_string_equals_literal_ci(type, "http_output_conv_mimetypes")) {
4810 RETURN_STR(
4812 "mbstring.http_output_conv_mimetypes",
4813 sizeof("mbstring.http_output_conv_mimetypes") - 1,
4814 false
4815 )
4816 );
4817 } else if (zend_string_equals_literal_ci(type, "mail_charset")) {
4818 name = (char *)mbfl_no_encoding2name(lang->mail_charset);
4820 } else if (zend_string_equals_literal_ci(type, "mail_header_encoding")) {
4823 } else if (zend_string_equals_literal_ci(type, "mail_body_encoding")) {
4826 } else if (zend_string_equals_literal_ci(type, "illegal_chars")) {
4828 } else if (zend_string_equals_literal_ci(type, "encoding_translation")) {
4830 RETURN_STRING("On");
4831 } else {
4832 RETURN_STRING("Off");
4833 }
4834 } else if (zend_string_equals_literal_ci(type, "language")) {
4837 } else if (zend_string_equals_literal_ci(type, "detect_order")) {
4838 // TODO Seems to always have one entry at least?
4841 if (n > 0) {
4842 size_t i;
4844 for (i = 0; i < n; i++) {
4845 add_next_index_string(return_value, (*entry)->name);
4846 entry++;
4847 }
4848 }
4849 } else if (zend_string_equals_literal_ci(type, "substitute_character")) {
4851 RETURN_STRING("none");
4853 RETURN_STRING("long");
4855 RETURN_STRING("entity");
4856 } else {
4858 }
4859 } else if (zend_string_equals_literal_ci(type, "strict_detection")) {
4860 if (MBSTRG(strict_detection)) {
4861 RETURN_STRING("On");
4862 } else {
4863 RETURN_STRING("Off");
4864 }
4865 } else {
4866 php_error_docref(NULL, E_WARNING, "argument #1 ($type) must be a valid type");
4868 }
4869}
4870/* }}} */
4871
4872MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4873{
4874 uint32_t wchar_buf[128];
4875 unsigned char *in = (unsigned char*)input;
4876 unsigned int state = 0;
4877
4878 if (encoding->check != NULL) {
4879 return encoding->check(in, length);
4880 }
4881
4882 /* If the input string is not encoded in the given encoding, there is a significant chance
4883 * that this will be seen in the first bytes. Therefore, rather than converting an entire
4884 * buffer of 128 codepoints, convert and check just a few codepoints first */
4885 size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
4886 ZEND_ASSERT(out_len <= 8);
4887 for (unsigned int i = 0; i < out_len; i++) {
4888 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4889 return false;
4890 }
4891 }
4892
4893 while (length) {
4894 out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
4895 ZEND_ASSERT(out_len <= 128);
4896 for (unsigned int i = 0; i < out_len; i++) {
4897 if (wchar_buf[i] == MBFL_BAD_INPUT) {
4898 return false;
4899 }
4900 }
4901 }
4902
4903 return true;
4904}
4905
4906/* MSVC 32-bit has issues with 64-bit intrinsics.
4907 * (Bad 7/8-byte UTF-8 strings would be wrongly passed through as 'valid')
4908 * It seems this is caused by a bug in MS Visual C++
4909 * Ref: https://stackoverflow.com/questions/37509129/potential-bug-in-visual-studio-c-compiler-or-in-intel-intrinsics-avx2-mm256-s */
4910#if defined(PHP_WIN32) && !defined(__clang__) && defined(_MSC_VER) && defined(_M_IX86)
4911# define MBSTRING_BROKEN_X86_MSVC_INTRINSICS
4912#endif
4913
4914/* If we are building an AVX2-only binary, don't compile the next function */
4915#ifndef ZEND_INTRIN_AVX2_NATIVE
4916
4917/* SSE2-based function for validating UTF-8 strings
4918 * A faster implementation which uses AVX2 instructions follows
 *
 * Returns true if the bytes of `str` form valid UTF-8, and false otherwise.
 *
 * When __SSE2__ is defined, the string content (plus its terminating null byte) is
 * checked in 16-byte blocks; a trailing partial block is loaded by the `switch` near
 * the end of the function, which jumps back to `check_operand` to reuse the same checks.
 * Otherwise, a scalar byte-by-byte validator derived from PCRE2 is used. */
4919static bool mb_fast_check_utf8_default(zend_string *str)
4920{
4921 unsigned char *p = (unsigned char*)ZSTR_VAL(str);
4922# ifdef __SSE2__
4923 /* `e` points 1 byte past the last full 16-byte block of string content
4924 * Note that we include the terminating null byte which is included in each zend_string
4925 * as part of the content to check; this ensures that multi-byte characters which are
4926 * truncated abruptly at the end of the string will be detected as invalid */
4927 unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m128i) - 1));
4928
4929 /* For checking for illegal bytes 0xF5-FF */
4930 const __m128i over_f5 = _mm_set1_epi8(-117);
4931 /* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4932 const __m128i over_9f = _mm_set1_epi8(-97);
4933 /* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4934 const __m128i over_8f = _mm_set1_epi8(-113);
4935 /* For checking for illegal bytes 0xC0-C1 */
4936 const __m128i find_c0 = _mm_set1_epi8(-64);
4937 const __m128i c0_to_c1 = _mm_set1_epi8(-126);
4938 /* For checking structure of continuation bytes */
4939 const __m128i find_e0 = _mm_set1_epi8(-32);
4940 const __m128i find_f0 = _mm_set1_epi8(-16);
4941
 /* `last_block` holds the previously checked 16 bytes, so that characters which straddle
  * a block boundary can be validated; it starts out as all zeroes */
4942 __m128i last_block = _mm_setzero_si128();
4943 __m128i operand;
4944
4945 while (p < e) {
4946 operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */
4947
 /* Also entered via `goto` from the tail-handling `switch` below, with `operand` already
  * holding the final partial block */
4948check_operand:
4949 /* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
4950 if (!_mm_movemask_epi8(operand)) {
4951 /* Even if this block only contains single-byte characters, there may have been a
4952 * multi-byte character at the end of the previous block, which was supposed to
4953 * have continuation bytes in this block
4954 * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
4955 * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
4956 * from the 3rd last */
4957 __m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
4958 __m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
4959 if (_mm_movemask_epi8(bad)) {
4960 return false;
4961 }
4962
4963 /* Consume as many full blocks of single-byte characters as we can */
4964 while (true) {
4965 p += sizeof(__m128i);
4966 if (p >= e) {
4967 goto finish_up_remaining_bytes;
4968 }
4969 operand = _mm_loadu_si128((__m128i*)p);
4970 if (_mm_movemask_epi8(operand)) {
4971 break;
4972 }
4973 }
4974 }
4975
4976 /* Check for >= 0xF5, which are illegal byte values in UTF-8
4977 * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4978 * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4979 * Then a single signed compare will pick out any bad bytes
4980 * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4981 __m128i bad = _mm_cmplt_epi8(_mm_add_epi8(operand, over_f5), over_f5);
4982
4983 /* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4984 * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4985 * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4986 * We can check for both problems at once by generating a vector where each byte < 0xA0
4987 * is mapped to 0xE0, and each byte >= 0xA0 is mapped to 0xED
4988 * Shift the original block right by one byte, and compare the shifted block with the bitmask */
4989 __m128i operand2 = _mm_or_si128(_mm_slli_si128(operand, 1), _mm_srli_si128(last_block, 15));
4990 __m128i mask1 = _mm_or_si128(find_e0, _mm_and_si128(_mm_set1_epi8(0xD), _mm_cmpgt_epi8(operand, over_9f)));
4991 bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask1));
4992
4993 /* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4994 * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4995 * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4996 * Build the bitmask and compare it with the shifted block */
4997 __m128i mask2 = _mm_or_si128(find_f0, _mm_and_si128(_mm_set1_epi8(0x4), _mm_cmpgt_epi8(operand, over_8f)));
4998 bad = _mm_or_si128(bad, _mm_cmpeq_epi8(operand2, mask2));
4999
5000 /* Check for overlong 2-byte code units
5001 * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
5002 * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
5003 * byte range, do a signed compare to pick out any bad bytes */
5004 bad = _mm_or_si128(bad, _mm_cmplt_epi8(_mm_add_epi8(operand, find_c0), c0_to_c1));
5005
5006 /* Check structure of continuation bytes
5007 * A UTF-8 byte should be a continuation byte if, and only if, it is:
5008 * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
5009 * 2) 2 bytes after the start of a 3-byte or 4-byte character
5010 * 3) 3 bytes after the start of a 4-byte character
5011 * We build 3 bitmasks with 0xFF in each such position, and OR them together to
5012 * get a single bitmask with 0xFF in each position where a continuation byte should be */
5013 __m128i cont_mask = _mm_cmpeq_epi8(_mm_and_si128(operand2, find_c0), find_c0);
5014 __m128i operand3 = _mm_or_si128(_mm_slli_si128(operand, 2), _mm_srli_si128(last_block, 14));
5015 cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand3, find_e0), find_e0));
5016 __m128i operand4 = _mm_or_si128(_mm_slli_si128(operand, 3), _mm_srli_si128(last_block, 13));
5017 cont_mask = _mm_or_si128(cont_mask, _mm_cmpeq_epi8(_mm_and_si128(operand4, find_f0), find_f0));
5018
5019 /* Now, use a signed comparison to get another bitmask with 0xFF in each position where
5020 * a continuation byte actually is
5021 * XOR those two bitmasks together; if everything is good, the result should be zero
5022 * However, if a byte which should have been a continuation wasn't, or if a byte which
5023 * shouldn't have been a continuation was, we will get 0xFF in that position */
5024 __m128i continuation = _mm_cmplt_epi8(operand, find_c0);
5025 bad = _mm_or_si128(bad, _mm_xor_si128(continuation, cont_mask));
5026
5027 /* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
5028 * If that value is non-zero, then we found a bad byte somewhere! */
5029 if (_mm_movemask_epi8(bad)) {
5030 return false;
5031 }
5032
5033 last_block = operand;
5034 p += sizeof(__m128i);
5035 }
5036
5037finish_up_remaining_bytes:
5038 /* Finish up 1-15 remaining bytes */
 /* `p == e` only holds before the tail block has been checked; once a tail block has gone
  * through `check_operand`, `p` has been advanced past `e`, so this branch is skipped and
  * we fall through to the final `return true` */
5039 if (p == e) {
5040 uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m128i) - 1); /* Not including terminating null */
5041
5042 /* Crazy hack here for cases where 9 or more bytes are remaining...
 * (the same technique is also used below for 5 and 6 remaining bytes)
5043 * We want to use the above vectorized code to check a block of less than 16 bytes,
5044 * but there is no good way to read a variable number of bytes into an XMM register
5045 * However, we know that these bytes are part of a zend_string, and a zend_string has some
5046 * 'header' fields which occupy the memory just before its content
5047 * And, those header fields occupy more than 16 bytes...
5048 * So if we go back 16 bytes from the end of the zend_string content, and load 16 bytes from there,
5049 * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
5050 * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
5051 * Then, we do a left shift to get rid of the unwanted bytes
5052 * Conveniently, the same left shift also zero-fills the tail end of the XMM register
5053 *
5054 * The following `switch` looks useless, but it's not
5055 * The PSRLDQ instruction used for the 128-bit left shift requires an immediate (literal)
5056 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
5057 */
5058 switch (remaining_bytes) {
5059 case 0: ;
 /* No partial block remains; only verify that the last full block did not end with the
  * start of a multi-byte character whose continuation bytes are missing */
5060 __m128i bad_mask = _mm_set_epi8(-64, -32, -16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
5061 __m128i bad = _mm_cmpeq_epi8(_mm_and_si128(last_block, bad_mask), bad_mask);
5062 return _mm_movemask_epi8(bad) == 0;
5063 case 1:
5064 case 2:
 /* Load 2 bytes and zero-fill the rest of the register; when only 1 content byte
  * remains, the 2nd loaded byte is the string's terminating null */
5065 operand = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, *((uint16_t*)p));
5066 goto check_operand;
5067 case 3:
5068 case 4:
 /* Load 4 bytes and zero-fill the rest; with 3 content bytes remaining, the 4th loaded
  * byte is the terminating null */
5069 operand = _mm_set_epi32(0, 0, 0, *((uint32_t*)p));
5070 goto check_operand;
5071 case 5:
 /* Load 16 bytes starting 10 bytes before `p`, then shift the 10 leading bytes out; this
  * leaves p[0..5] (5 content bytes + terminating null) followed by zeroes
  * The cases for 6 and 9-14 remaining bytes follow the same pattern */
5072 operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10);
5073 goto check_operand;
5074 case 6:
5075 operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9);
5076 goto check_operand;
5077 case 7:
5078 case 8:
 /* Load 8 bytes and zero-fill the rest; with 7 content bytes remaining, the 8th loaded
  * byte is the terminating null */
5079#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5080 operand = _mm_set_epi32(0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5081#else
5082 operand = _mm_set_epi64x(0, *((uint64_t*)p));
5083#endif
5084 goto check_operand;
5085 case 9:
5086 operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6);
5087 goto check_operand;
5088 case 10:
5089 operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5);
5090 goto check_operand;
5091 case 11:
5092 operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4);
5093 goto check_operand;
5094 case 12:
5095 operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3);
5096 goto check_operand;
5097 case 13:
5098 operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2);
5099 goto check_operand;
5100 case 14:
5101 operand = _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1);
5102 goto check_operand;
5103 case 15:
5104 /* No trailing bytes are left which need to be checked
5105 * We get 15 because we did not include the terminating null when
5106 * calculating `remaining_bytes`, so the value wraps around */
5107 return true;
5108 }
5109
5111 }
5112
5113 return true;
5114# else
5115 /* This UTF-8 validation function is derived from PCRE2 */
5116 size_t length = ZSTR_LEN(str);
5117 /* Table of the number of extra bytes, indexed by the first byte masked with
5118 0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
5119 static const uint8_t utf8_table[] = {
5120 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
5121 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
5122 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
5123 3,3,3,3,3,3,3,3
5124 };
5125
 /* `length` counts the bytes not yet consumed; `p` points at the lead byte of the
  * character currently being examined */
5126 for (; length > 0; p++) {
5127 uint32_t d;
5128 unsigned char c = *p;
5129 length--;
5130
5131 if (c < 128) {
5132 /* ASCII character */
5133 continue;
5134 }
5135
5136 if (c < 0xc0) {
5137 /* Isolated 10xx xxxx byte */
5138 return false;
5139 }
5140
5141 if (c >= 0xf5) {
 /* Byte values 0xF5-0xFF are rejected outright */
5142 return false;
5143 }
5144
5145 uint32_t ab = utf8_table[c & 0x3f]; /* Number of additional bytes (1-3) */
5146 if (length < ab) {
5147 /* Missing bytes */
5148 return false;
5149 }
 /* Account for the continuation bytes now; `p` is advanced over them by the checks below */
5150 length -= ab;
5151
5152 /* Check top bits in the second byte */
5153 if (((d = *(++p)) & 0xc0) != 0x80) {
5154 return false;
5155 }
5156
5157 /* For each length, check that the remaining bytes start with the 0x80 bit
5158 * set and not the 0x40 bit. Then check for an overlong sequence, and for the
5159 * excluded range 0xd800 to 0xdfff. */
5160 switch (ab) {
5161 case 1:
5162 /* 2-byte character. No further bytes to check for 0x80. Check first byte
5163 * for xx00 000x (overlong sequence). */
5164 if ((c & 0x3e) == 0) {
5165 return false;
5166 }
5167 break;
5168
5169 case 2:
5170 /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes for
5171 * 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd800-0xdfff) */
5172 if ((*(++p) & 0xc0) != 0x80 || (c == 0xe0 && (d & 0x20) == 0) || (c == 0xed && d >= 0xa0)) {
5173 return false;
5174 }
5175 break;
5176
5177 case 3:
5178 /* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
5179 * bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
5180 * character greater than 0x0010ffff (f4 8f bf bf) */
5181 if ((*(++p) & 0xc0) != 0x80 || (*(++p) & 0xc0) != 0x80 || (c == 0xf0 && (d & 0x30) == 0) || (c > 0xf4 || (c == 0xf4 && d > 0x8f))) {
5182 return false;
5183 }
5184 break;
5185
5187 }
5188 }
5189
5190 return true;
5191# endif
5192}
5193
5194#endif /* #ifndef ZEND_INTRIN_AVX2_NATIVE */
5195
5196#ifdef ZEND_INTRIN_AVX2_NATIVE
5197
5198/* We are building AVX2-only binary */
5199# include <immintrin.h>
5200# define mb_fast_check_utf8 mb_fast_check_utf8_avx2
5201
5202#elif defined(ZEND_INTRIN_AVX2_RESOLVER)
5203
5204/* We are building binary which works with or without AVX2; whether or not to use
5205 * AVX2-accelerated functions will be determined at runtime */
5206# include <immintrin.h>
5207# include "Zend/zend_cpuinfo.h"
5208
5209# ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5210/* Dynamic linker will decide whether or not to use AVX2-based functions and
5211 * resolve symbols accordingly */
5212
5213ZEND_INTRIN_AVX2_FUNC_DECL(bool mb_fast_check_utf8_avx2(zend_string *str));
5214
5215bool mb_fast_check_utf8(zend_string *str) __attribute__((ifunc("resolve_check_utf8")));
5216
5217typedef bool (*check_utf8_func_t)(zend_string*);
5218
5221static check_utf8_func_t resolve_check_utf8(void)
5222{
5223 if (zend_cpu_supports_avx2()) {
5224 return mb_fast_check_utf8_avx2;
5225 }
5226 return mb_fast_check_utf8_default;
5227}
5228
5229# else /* ZEND_INTRIN_AVX2_FUNC_PTR */
5230/* We are compiling for a target where the dynamic linker will not be able to
5231 * resolve symbols according to whether the host supports AVX2 or not; so instead,
5232 * we can make calls go through a function pointer and set the function pointer
5233 * on module load */
5234
5235#ifdef HAVE_FUNC_ATTRIBUTE_TARGET
5236static bool mb_fast_check_utf8_avx2(zend_string *str) __attribute__((target("avx2")));
5237#else
5238static bool mb_fast_check_utf8_avx2(zend_string *str);
5239#endif
5240
5241static bool (*check_utf8_ptr)(zend_string *str) = NULL;
5242
5243static bool mb_fast_check_utf8(zend_string *str)
5244{
5245 return check_utf8_ptr(str);
5246}
5247
5248static void init_check_utf8(void)
5249{
5250 if (zend_cpu_supports_avx2()) {
5251 check_utf8_ptr = mb_fast_check_utf8_avx2;
5252 } else {
5253 check_utf8_ptr = mb_fast_check_utf8_default;
5254 }
5255}
5256# endif
5257
5258#else
5259
5260/* No AVX2 support */
5261#define mb_fast_check_utf8 mb_fast_check_utf8_default
5262
5263#endif
5264
5265#if defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER)
5266
5267/* GCC prior to version 8 does not define all intrinsics. See GH-11514.
5268 * Use a workaround from https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values */
5269#if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && __GNUC__ < 8
5270# define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
5271#endif
5272
/* Take (256-bit) `hi` and `lo` as a 512-bit value, shift down by some
 * number of bytes, then take the low 256 bits
 * This is used to take some number of trailing bytes from the previous 32-byte
 * block followed by some number of leading bytes from the current 32-byte block
 *
 * _mm256_alignr_epi8 (VPALIGNR) is used to shift out bytes from a 256-bit
 * YMM register while shifting in bytes from another YMM register... but
 * it works separately on respective 128-bit halves of the YMM registers,
 * which is not what we want.
 * To make it work as desired, we first do _mm256_permute2x128_si256
 * (VPERM2I128) to combine the low 128 bits from the previous block and
 * the high 128 bits of the current block in one YMM register.
 * Then VPALIGNR will do what is needed.
 *
 * Every macro parameter is parenthesized in the expansion, so that an expression
 * argument for `shift` (such as `n + 1`) is subtracted from 16 as a whole. */
#define _mm256_shift_epi8(hi, lo, shift) _mm256_alignr_epi8((lo), _mm256_permute2x128_si256((hi), (lo), 33), 16 - (shift))
5287
5288/* AVX2-based UTF-8 validation function; validates text in 32-byte chunks
5289 *
5290 * Some parts of this function are the same as `mb_fast_check_utf8_default`; code
5291 * comments are not repeated, so consult `mb_fast_check_utf8_default` for information
5292 * on uncommented sections. */
5293#ifdef ZEND_INTRIN_AVX2_FUNC_PROTO
5294ZEND_API bool mb_fast_check_utf8_avx2(zend_string *str)
5295#else
5296static bool mb_fast_check_utf8_avx2(zend_string *str)
5297#endif
5298{
5299 unsigned char *p = (unsigned char*)ZSTR_VAL(str);
5300 unsigned char *e = p + ((ZSTR_LEN(str) + 1) & ~(sizeof(__m256i) - 1));
5301
5302 /* The algorithm used here for UTF-8 validation is partially adapted from the
5303 * paper "Validating UTF-8 In Less Than One Instruction Per Byte", by John Keiser
5304 * and Daniel Lemire.
5305 * Ref: https://arxiv.org/pdf/2010.03090.pdf
5306 *
5307 * Most types of invalid UTF-8 text can be detected by examining pairs of
5308 * successive bytes. Specifically:
5309 *
5310 * • Overlong 2-byte code units start with 0xC0 or 0xC1.
5311 * No valid UTF-8 string ever uses these byte values.
5312 * • Overlong 3-byte code units start with 0xE0, followed by a byte < 0xA0.
5313 * • Overlong 4-byte code units start with 0xF0, followed by a byte < 0x90.
5314 * • 5-byte or 6-byte code units, which should never be used, start with
5315 * 0xF8-FE.
5316 * • A codepoint value higher than U+10FFFF, which is the highest value for
5317 * any Unicode codepoint, would either start with 0xF4, followed by a
5318 * byte >= 0x90, or else would start with 0xF5-F7, followed by any value.
5319 * • A codepoint value from U+D800-DFFF, which are reserved and should never
5320 * be used, would start with 0xED, followed by a byte >= 0xA0.
5321 * • The byte value 0xFF is also illegal and is never used in valid UTF-8.
5322 *
5323 * To detect all these problems, for each pair of successive bytes, we do
5324 * table lookups using the high nibble of the first byte, the low nibble of
5325 * the first byte, and the high nibble of the second byte. Each table lookup
5326 * retrieves a bitmask, in which each 1 bit indicates a possible invalid
5327 * combination; AND those three bitmasks together, and any 1 bit in the result
5328 * will indicate an actual invalid byte combination was found.
5329 */
5330
5331#define BAD_BYTE 0x1
5332#define OVERLONG_2BYTE 0x2
5333#define _1BYTE (BAD_BYTE | OVERLONG_2BYTE)
5334#define OVERLONG_3BYTE 0x4
5335#define SURROGATE 0x8
5336#define OVERLONG_4BYTE 0x10
5337#define INVALID_CP 0x20
5338
5339 /* Each of these are 16-entry tables, repeated twice; this is required by the
5340 * VPSHUFB instruction which we use to perform 32 table lookups in parallel
5341 * The first entry is for 0xF, the second is for 0xE, and so on down to 0x0
5342 *
5343 * So, for example, notice that the 4th entry in the 1st table is OVERLONG_2BYTE;
5344 * that means that high nibble 0xC is consistent with the byte pair being part of
5345 * an overlong 2-byte code unit */
5346 const __m256i bad_hi_nibble2 = _mm256_set_epi8(
5347 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5348 0, 0, 0, 0,
5349 0, 0, 0, 0,
5350 0, 0, 0, 0,
5351 BAD_BYTE | OVERLONG_4BYTE | INVALID_CP, OVERLONG_3BYTE | SURROGATE, 0, OVERLONG_2BYTE,
5352 0, 0, 0, 0,
5353 0, 0, 0, 0,
5354 0, 0, 0, 0);
5355 const __m256i bad_lo_nibble2 = _mm256_set_epi8(
5356 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5357 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5358 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5359 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5360 BAD_BYTE, BAD_BYTE, BAD_BYTE | SURROGATE, BAD_BYTE,
5361 BAD_BYTE, BAD_BYTE, BAD_BYTE, BAD_BYTE,
5362 BAD_BYTE, BAD_BYTE, BAD_BYTE, INVALID_CP,
5363 0, 0, OVERLONG_2BYTE, OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5364 const __m256i bad_hi_nibble = _mm256_set_epi8(
5365 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5366 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5367 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5368 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5369 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5370 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5371 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5372 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5373 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5374 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5375 _1BYTE | SURROGATE | INVALID_CP, _1BYTE | SURROGATE | INVALID_CP,
5376 _1BYTE | OVERLONG_3BYTE | INVALID_CP, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5377 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5378 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5379 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
5380 _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE, _1BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE);
5381
5382 const __m256i find_continuation = _mm256_set1_epi8(-64);
5383 const __m256i _b = _mm256_set1_epi8(0xB);
5384 const __m256i _d = _mm256_set1_epi8(0xD);
5385 const __m256i _f = _mm256_set1_epi8(0xF);
5386
5387 __m256i last_hi_nibbles = _mm256_setzero_si256(), last_lo_nibbles = _mm256_setzero_si256();
5388 __m256i operand;
5389
5390 while (p < e) {
5391 operand = _mm256_loadu_si256((__m256i*)p);
5392
5393check_operand:
5394 if (!_mm256_movemask_epi8(operand)) {
5395 /* Entire 32-byte block is ASCII characters; the only thing we need to validate is that
5396 * the previous block didn't end with an incomplete multi-byte character
5397 * (This will also confirm that the previous block didn't end with a bad byte like 0xFF) */
5398 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5399 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5400 if (_mm256_movemask_epi8(bad)) {
5401 return false;
5402 }
5403
5404 /* Consume as many full blocks of single-byte characters as we can */
5405 while (true) {
5406 p += sizeof(__m256i);
5407 if (p >= e) {
5408 goto finish_up_remaining_bytes;
5409 }
5410 operand = _mm256_loadu_si256((__m256i*)p);
5411 if (_mm256_movemask_epi8(operand)) {
5412 break;
5413 }
5414 }
5415 }
5416
5417 __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(operand, 4), _f);
5418 __m256i lo_nibbles = _mm256_and_si256(operand, _f);
5419
5420 __m256i lo_nibbles2 = _mm256_shift_epi8(last_lo_nibbles, lo_nibbles, 1);
5421 __m256i hi_nibbles2 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 1);
5422
5423 /* Do parallel table lookups in all 3 tables */
5424 __m256i bad = _mm256_cmpgt_epi8(
5425 _mm256_and_si256(
5426 _mm256_and_si256(
5427 _mm256_shuffle_epi8(bad_lo_nibble2, lo_nibbles2),
5428 _mm256_shuffle_epi8(bad_hi_nibble2, hi_nibbles2)),
5429 _mm256_shuffle_epi8(bad_hi_nibble, hi_nibbles)),
5430 _mm256_setzero_si256());
5431
5432 __m256i cont_mask = _mm256_cmpgt_epi8(hi_nibbles2, _b);
5433 __m256i hi_nibbles3 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 2);
5434 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpgt_epi8(hi_nibbles3, _d));
5435 __m256i hi_nibbles4 = _mm256_shift_epi8(last_hi_nibbles, hi_nibbles, 3);
5436 cont_mask = _mm256_or_si256(cont_mask, _mm256_cmpeq_epi8(hi_nibbles4, _f));
5437
5438 __m256i continuation = _mm256_cmpgt_epi8(find_continuation, operand);
5439 bad = _mm256_or_si256(bad, _mm256_xor_si256(continuation, cont_mask));
5440
5441 if (_mm256_movemask_epi8(bad)) {
5442 return false;
5443 }
5444
5445 last_hi_nibbles = hi_nibbles;
5446 last_lo_nibbles = lo_nibbles;
5447 p += sizeof(__m256i);
5448 }
5449
5450finish_up_remaining_bytes:
5451 if (p == e) {
5452 uint8_t remaining_bytes = ZSTR_LEN(str) & (sizeof(__m256i) - 1); /* Not including terminating null */
5453
5454 switch (remaining_bytes) {
5455 case 0: ;
5456 /* No actual data bytes are remaining */
5457 __m256i bad_mask = _mm256_set_epi8(0xB, 0xD, 0xE, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127);
5458 __m256i bad = _mm256_cmpgt_epi8(last_hi_nibbles, bad_mask);
5459 return _mm256_movemask_epi8(bad) == 0;
5460 case 1:
5461 case 2:
5462 operand = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, *((int16_t*)p));
5463 goto check_operand;
5464 case 3:
5465 case 4:
5466 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, *((int32_t*)p));
5467 goto check_operand;
5468 case 5:
5469 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 10)), 10));
5470 goto check_operand;
5471 case 6:
5472 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 9)), 9));
5473 goto check_operand;
5474 case 7:
5475 case 8:
5476#ifdef MBSTRING_BROKEN_X86_MSVC_INTRINSICS
5477 operand = _mm256_set_epi32(0, 0, 0, 0, 0, 0, ((int32_t*)p)[1], ((int32_t*)p)[0]);
5478#else
5479 operand = _mm256_set_epi64x(0, 0, 0, *((int64_t*)p));
5480#endif
5481 goto check_operand;
5482 case 9:
5483 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 6)), 6));
5484 goto check_operand;
5485 case 10:
5486 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 5)), 5));
5487 goto check_operand;
5488 case 11:
5489 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 4)), 4));
5490 goto check_operand;
5491 case 12:
5492 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 3)), 3));
5493 goto check_operand;
5494 case 13:
5495 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 2)), 2));
5496 goto check_operand;
5497 case 14:
5498 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_srli_si128(_mm_loadu_si128((__m128i*)(p - 1)), 1));
5499 goto check_operand;
5500 case 15:
5501 case 16:
5502 operand = _mm256_set_m128i(_mm_setzero_si128(), _mm_loadu_si128((__m128i*)p));
5503 goto check_operand;
5504 case 17:
5505 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 2)), 14), _mm_loadu_si128((__m128i*)p));
5506 goto check_operand;
5507 case 18:
5508 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 3)), 13), _mm_loadu_si128((__m128i*)p));
5509 goto check_operand;
5510 case 19:
5511 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 4)), 12), _mm_loadu_si128((__m128i*)p));
5512 goto check_operand;
5513 case 20:
5514 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 5)), 11), _mm_loadu_si128((__m128i*)p));
5515 goto check_operand;
5516 case 21:
5517 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 6)), 10), _mm_loadu_si128((__m128i*)p));
5518 goto check_operand;
5519 case 22:
5520 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 7)), 9), _mm_loadu_si128((__m128i*)p));
5521 goto check_operand;
5522 case 23:
5523 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 8)), 8), _mm_loadu_si128((__m128i*)p));
5524 goto check_operand;
5525 case 24:
5526 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 9)), 7), _mm_loadu_si128((__m128i*)p));
5527 goto check_operand;
5528 case 25:
5529 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 10)), 6), _mm_loadu_si128((__m128i*)p));
5530 goto check_operand;
5531 case 26:
5532 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 11)), 5), _mm_loadu_si128((__m128i*)p));
5533 goto check_operand;
5534 case 27:
5535 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 12)), 4), _mm_loadu_si128((__m128i*)p));
5536 goto check_operand;
5537 case 28:
5538 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 13)), 3), _mm_loadu_si128((__m128i*)p));
5539 goto check_operand;
5540 case 29:
5541 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 14)), 2), _mm_loadu_si128((__m128i*)p));
5542 goto check_operand;
5543 case 30:
5544 operand = _mm256_set_m128i(_mm_srli_si128(_mm_loadu_si128((__m128i*)(p + 15)), 1), _mm_loadu_si128((__m128i*)p));
5545 goto check_operand;
5546 case 31:
5547 return true;
5548 }
5549
5551 }
5552
5553 return true;
5554}
5555
5556#endif /* defined(ZEND_INTRIN_AVX2_NATIVE) || defined(ZEND_INTRIN_AVX2_RESOLVER) */
5557
5558static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
5559{
5560 if (encoding == &mbfl_encoding_utf8) {
5561 if (ZSTR_IS_VALID_UTF8(str)) {
5562 return true;
5563 }
5564 bool result = mb_fast_check_utf8(str);
5565 if (result && !ZSTR_IS_INTERNED(str)) {
5567 }
5568 return result;
5569 } else {
5570 return php_mb_check_encoding(ZSTR_VAL(str), ZSTR_LEN(str), encoding);
5571 }
5572}
5573
5574static bool php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
5575{
5576 zend_long idx;
5578 zval *entry;
5579 bool valid = true;
5580
5581 (void)(idx); /* Suppress spurious compiler warning that `idx` is not used */
5582
5583 if (GC_IS_RECURSIVE(vars)) {
5584 php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
5585 return false;
5586 }
5588 ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
5589 ZVAL_DEREF(entry);
5590 if (key) {
5591 if (!mb_check_str_encoding(key, encoding)) {
5592 valid = false;
5593 break;
5594 }
5595 }
5596 switch (Z_TYPE_P(entry)) {
5597 case IS_STRING:
5598 if (!mb_check_str_encoding(Z_STR_P(entry), encoding)) {
5599 valid = false;
5600 break;
5601 }
5602 break;
5603 case IS_ARRAY:
5604 if (!php_mb_check_encoding_recursive(Z_ARRVAL_P(entry), encoding)) {
5605 valid = false;
5606 break;
5607 }
5608 break;
5609 case IS_LONG:
5610 case IS_DOUBLE:
5611 case IS_NULL:
5612 case IS_TRUE:
5613 case IS_FALSE:
5614 break;
5615 default:
5616 /* Other types are error. */
5617 valid = false;
5618 break;
5619 }
5622 return valid;
5623}
5624
5625/* {{{ Check if the string is valid for the specified encoding */
5627{
5628 zend_string *input_str = NULL, *enc = NULL;
5629 HashTable *input_ht = NULL;
5630 const mbfl_encoding *encoding;
5631
5634 Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(input_ht, input_str)
5637
5638 encoding = php_mb_get_encoding(enc, 2);
5639 if (!encoding) {
5640 RETURN_THROWS();
5641 }
5642
5643 if (input_ht) {
5644 RETURN_BOOL(php_mb_check_encoding_recursive(input_ht, encoding));
5645 } else if (input_str) {
5646 RETURN_BOOL(mb_check_str_encoding(input_str, encoding));
5647 } else {
5649 "Calling mb_check_encoding() without argument is deprecated");
5650
5651 /* FIXME: Actually check all inputs, except $_FILES file content. */
5653 }
5654}
5655/* }}} */
5656
5657static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string *enc_name,
5658 const uint32_t enc_name_arg_num)
5659{
5660 const mbfl_encoding *enc;
5661 enum mbfl_no_encoding no_enc;
5662
5663 ZEND_ASSERT(str_len > 0);
5664
5665 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5666 if (!enc) {
5667 return -2;
5668 }
5669
5670 no_enc = enc->no_encoding;
5671 if (php_mb_is_unsupported_no_encoding(no_enc)) {
5672 zend_value_error("mb_ord() does not support the \"%s\" encoding", enc->name);
5673 return -2;
5674 }
5675
5676 /* Some legacy text encodings have a minimum required wchar buffer size;
5677 * the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
5678 uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
5679 unsigned int state = 0;
5680 size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
5682
5683 if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
5684 return -1;
5685 }
5686 return wchar_buf[0];
5687}
5688
5689/* {{{ */
5691{
5692 char *str;
5693 size_t str_len;
5694 zend_string *enc = NULL;
5695 zend_long cp;
5696
5698 Z_PARAM_STRING(str, str_len)
5702
5703 if (str_len == 0) {
5705 RETURN_THROWS();
5706 }
5707
5708 cp = php_mb_ord(str, str_len, enc, 2);
5709
5710 if (0 > cp) {
5711 if (cp == -2) {
5712 RETURN_THROWS();
5713 }
5715 }
5716
5717 RETURN_LONG(cp);
5718}
5719/* }}} */
5720
5721static inline zend_string *php_mb_chr(zend_long cp, zend_string *enc_name, uint32_t enc_name_arg_num)
5722{
5723 const mbfl_encoding *enc;
5724 enum mbfl_no_encoding no_enc;
5726 char buf[4];
5727
5728 enc = php_mb_get_encoding(enc_name, enc_name_arg_num);
5729 if (!enc) {
5730 return NULL;
5731 }
5732
5733 no_enc = enc->no_encoding;
5734 if (php_mb_is_unsupported_no_encoding(no_enc)) {
5735 zend_value_error("mb_chr() does not support the \"%s\" encoding", enc->name);
5736 return NULL;
5737 }
5738
5739 if (cp < 0 || cp > 0x10ffff) {
5740 return NULL;
5741 }
5742
5743 if (php_mb_is_no_encoding_utf8(no_enc)) {
5744 if (cp > 0xd7ff && 0xe000 > cp) {
5745 return NULL;
5746 }
5747
5748 if (cp < 0x80) {
5749 ret = ZSTR_CHAR(cp);
5750 } else if (cp < 0x800) {
5751 ret = zend_string_alloc(2, 0);
5752 ZSTR_VAL(ret)[0] = 0xc0 | (cp >> 6);
5753 ZSTR_VAL(ret)[1] = 0x80 | (cp & 0x3f);
5754 ZSTR_VAL(ret)[2] = 0;
5755 } else if (cp < 0x10000) {
5756 ret = zend_string_alloc(3, 0);
5757 ZSTR_VAL(ret)[0] = 0xe0 | (cp >> 12);
5758 ZSTR_VAL(ret)[1] = 0x80 | ((cp >> 6) & 0x3f);
5759 ZSTR_VAL(ret)[2] = 0x80 | (cp & 0x3f);
5760 ZSTR_VAL(ret)[3] = 0;
5761 } else {
5762 ret = zend_string_alloc(4, 0);
5763 ZSTR_VAL(ret)[0] = 0xf0 | (cp >> 18);
5764 ZSTR_VAL(ret)[1] = 0x80 | ((cp >> 12) & 0x3f);
5765 ZSTR_VAL(ret)[2] = 0x80 | ((cp >> 6) & 0x3f);
5766 ZSTR_VAL(ret)[3] = 0x80 | (cp & 0x3f);
5767 ZSTR_VAL(ret)[4] = 0;
5768 }
5769
5770 return ret;
5771 }
5772
5773 buf[0] = (cp >> 24) & 0xff;
5774 buf[1] = (cp >> 16) & 0xff;
5775 buf[2] = (cp >> 8) & 0xff;
5776 buf[3] = cp & 0xff;
5777
5778 long orig_illegalchars = MBSTRG(illegalchars);
5779 MBSTRG(illegalchars) = 0;
5781
5782 if (MBSTRG(illegalchars) != 0) {
5783 zend_string_release(ret);
5784 ret = NULL;
5785 }
5786
5787 MBSTRG(illegalchars) = orig_illegalchars;
5788 return ret;
5789}
5790
5791/* {{{ */
5793{
5794 zend_long cp;
5795 zend_string *enc = NULL;
5796
5802
5803 zend_string* ret = php_mb_chr(cp, enc, 2);
5804 if (ret == NULL) {
5806 }
5807
5808 RETURN_STR(ret);
5809}
5810/* }}} */
5811
5813{
5814 zend_string *input, *encoding_str = NULL, *pad = ZSTR_CHAR(' ');
5815 zend_long pad_to_length;
5816 zend_long pad_type_val = PHP_STR_PAD_RIGHT;
5817
5819 Z_PARAM_STR(input)
5820 Z_PARAM_LONG(pad_to_length)
5822 Z_PARAM_STR(pad)
5823 Z_PARAM_LONG(pad_type_val)
5824 Z_PARAM_STR_OR_NULL(encoding_str)
5826
5827 const mbfl_encoding *encoding = php_mb_get_encoding(encoding_str, 5);
5828 if (!encoding) {
5829 RETURN_THROWS();
5830 }
5831
5832 size_t input_length = mb_get_strlen(input, encoding);
5833
5834 /* If resulting string turns out to be shorter than input string,
5835 we simply copy the input and return. */
5836 if (pad_to_length < 0 || (size_t)pad_to_length <= input_length) {
5837 RETURN_STR_COPY(input);
5838 }
5839
5840 if (ZSTR_LEN(pad) == 0) {
5842 RETURN_THROWS();
5843 }
5844
5845 if (pad_type_val < PHP_STR_PAD_LEFT || pad_type_val > PHP_STR_PAD_BOTH) {
5846 zend_argument_value_error(4, "must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH");
5847 RETURN_THROWS();
5848 }
5849
5850 size_t pad_length = mb_get_strlen(pad, encoding);
5851
5852 size_t num_mb_pad_chars = pad_to_length - input_length;
5853
5854 /* We need to figure out the left/right padding lengths. */
5855 size_t left_pad = 0, right_pad = 0; /* Initialize here to silence compiler warnings. */
5856 switch (pad_type_val) {
5857 case PHP_STR_PAD_RIGHT:
5858 right_pad = num_mb_pad_chars;
5859 break;
5860
5861 case PHP_STR_PAD_LEFT:
5862 left_pad = num_mb_pad_chars;
5863 break;
5864
5865 case PHP_STR_PAD_BOTH:
5866 left_pad = num_mb_pad_chars / 2;
5867 right_pad = num_mb_pad_chars - left_pad;
5868 break;
5869 }
5870
5871 /* How many full block copies need to happen, and how many characters are then left over? */
5872 size_t full_left_pad_copies = left_pad / pad_length;
5873 size_t full_right_pad_copies = right_pad / pad_length;
5874 size_t remaining_left_pad_chars = left_pad % pad_length;
5875 size_t remaining_right_pad_chars = right_pad % pad_length;
5876
5877 if (UNEXPECTED(full_left_pad_copies > SIZE_MAX / ZSTR_LEN(pad) || full_right_pad_copies > SIZE_MAX / ZSTR_LEN(pad))) {
5878 goto overflow_no_release;
5879 }
5880
5881 /* Compute the number of bytes required for the padding */
5882 size_t full_left_pad_bytes = full_left_pad_copies * ZSTR_LEN(pad);
5883 size_t full_right_pad_bytes = full_right_pad_copies * ZSTR_LEN(pad);
5884
5885 /* No special fast-path handling necessary for zero-length pads because these functions will not
5886 * allocate memory in case a zero-length pad is required. */
5887 zend_string *remaining_left_pad_str = mb_get_substr(pad, 0, remaining_left_pad_chars, encoding);
5888 zend_string *remaining_right_pad_str = mb_get_substr(pad, 0, remaining_right_pad_chars, encoding);
5889
5890 if (UNEXPECTED(full_left_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_left_pad_str)
5891 || full_right_pad_bytes > ZSTR_MAX_LEN - ZSTR_LEN(remaining_right_pad_str))) {
5892 goto overflow;
5893 }
5894
5895 size_t left_pad_bytes = full_left_pad_bytes + ZSTR_LEN(remaining_left_pad_str);
5896 size_t right_pad_bytes = full_right_pad_bytes + ZSTR_LEN(remaining_right_pad_str);
5897
5898 if (UNEXPECTED(left_pad_bytes > ZSTR_MAX_LEN - right_pad_bytes
5899 || ZSTR_LEN(input) > ZSTR_MAX_LEN - left_pad_bytes - right_pad_bytes)) {
5900 goto overflow;
5901 }
5902
5903 zend_string *result = zend_string_alloc(ZSTR_LEN(input) + left_pad_bytes + right_pad_bytes, false);
5904 char *buffer = ZSTR_VAL(result);
5905
5906 /* First we pad the left. */
5907 for (size_t i = 0; i < full_left_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5908 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5909 }
5910 memcpy(buffer, ZSTR_VAL(remaining_left_pad_str), ZSTR_LEN(remaining_left_pad_str));
5911 buffer += ZSTR_LEN(remaining_left_pad_str);
5912
5913 /* Then we copy the input string. */
5914 memcpy(buffer, ZSTR_VAL(input), ZSTR_LEN(input));
5915 buffer += ZSTR_LEN(input);
5916
5917 /* Finally, we pad on the right. */
5918 for (size_t i = 0; i < full_right_pad_copies; i++, buffer += ZSTR_LEN(pad)) {
5919 memcpy(buffer, ZSTR_VAL(pad), ZSTR_LEN(pad));
5920 }
5921 memcpy(buffer, ZSTR_VAL(remaining_right_pad_str), ZSTR_LEN(remaining_right_pad_str));
5922
5923 ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
5924
5925 zend_string_release_ex(remaining_left_pad_str, false);
5926 zend_string_release_ex(remaining_right_pad_str, false);
5927
5929
5930overflow:
5931 zend_string_release_ex(remaining_left_pad_str, false);
5932 zend_string_release_ex(remaining_right_pad_str, false);
5933overflow_no_release:
5934 zend_throw_error(NULL, "String size overflow");
5935 RETURN_THROWS();
5936}
5937
5938/* {{{ */
5940{
5941 zend_string *str, *enc_name = NULL;
5942
5944 Z_PARAM_STR(str)
5946 Z_PARAM_STR_OR_NULL(enc_name)
5948
5949 const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 2);
5950 if (!enc) {
5951 RETURN_THROWS();
5952 }
5953
5954 if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
5955 /* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
5956 RETURN_STR_COPY(str);
5957 }
5958
5960}
5961/* }}} */
5962
5963/* {{{ php_mb_populate_current_detect_order_list */
5964static void php_mb_populate_current_detect_order_list(void)
5965{
5966 const mbfl_encoding **entry = 0;
5967 size_t nentries;
5968
5970 nentries = MBSTRG(detect_order_list_size);
5971 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5972 memcpy(ZEND_VOIDP(entry), MBSTRG(detect_order_list), sizeof(mbfl_encoding*) * nentries);
5973 } else {
5975 size_t i;
5977 entry = (const mbfl_encoding **)safe_emalloc(nentries, sizeof(mbfl_encoding*), 0);
5978 for (i = 0; i < nentries; i++) {
5979 entry[i] = mbfl_no2encoding(src[i]);
5980 }
5981 }
5984}
5985/* }}} */
5986
5987/* {{{ static int php_mb_encoding_translation() */
5988static int php_mb_encoding_translation(void)
5989{
5991}
5992/* }}} */
5993
5995{
5996 if (enc) {
5997 if (enc->mblen_table) {
5998 if (s) {
5999 return enc->mblen_table[*(unsigned char *)s];
6000 }
6001 } else if (enc->flag & MBFL_ENCTYPE_WCS2) {
6002 return 2;
6003 } else if (enc->flag & MBFL_ENCTYPE_WCS4) {
6004 return 4;
6005 }
6006 }
6007 return 1;
6008}
6009
6010MBSTRING_API char *php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
6011{
6012 const char *p = s;
6013 char *last=NULL;
6014
6015 if (nbytes == (size_t)-1) {
6016 size_t nb = 0;
6017
6018 while (*p != '\0') {
6019 if (nb == 0) {
6020 if ((unsigned char)*p == (unsigned char)c) {
6021 last = (char *)p;
6022 }
6023 nb = php_mb_mbchar_bytes(p, enc);
6024 if (nb == 0) {
6025 return NULL; /* something is going wrong! */
6026 }
6027 }
6028 --nb;
6029 ++p;
6030 }
6031 } else {
6032 size_t bcnt = nbytes;
6033 size_t nbytes_char;
6034 while (bcnt > 0) {
6035 if ((unsigned char)*p == (unsigned char)c) {
6036 last = (char *)p;
6037 }
6038 nbytes_char = php_mb_mbchar_bytes(p, enc);
6039 if (bcnt < nbytes_char) {
6040 return NULL;
6041 }
6042 p += nbytes_char;
6043 bcnt -= nbytes_char;
6044 }
6045 }
6046 return last;
6047}
6048
6050{
6051 /* We're using simple case-folding here, because we'd have to deal with remapping of
6052 * offsets otherwise. */
6055
6056 size_t n = mb_find_strpos(haystack_conv, needle_conv, &mbfl_encoding_utf8, offset, mode);
6057
6058 zend_string_free(haystack_conv);
6059 zend_string_free(needle_conv);
6060
6061 return n;
6062}
6063
6064static void php_mb_gpc_get_detect_order(const zend_encoding ***list, size_t *list_size) /* {{{ */
6065{
6066 *list = (const zend_encoding **)MBSTRG(http_input_list);
6067 *list_size = MBSTRG(http_input_list_size);
6068}
6069/* }}} */
6070
6071static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{ */
6072{
6074}
6075/* }}} */
6076
/* Base64 alphabet, indexed by 6-bit value (0..63).
 * Written as a string literal: the implicit terminating NUL supplies the
 * 65th byte, so the array has the same size and contents as an explicit
 * 65-element initializer ending in 0x00. */
static const unsigned char base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";
6089
6090static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
6091{
6092 if (base64) {
6093 return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
6094 } else {
6095 size_t enc_size = 0;
6096 unsigned char *p = (unsigned char*)ZSTR_VAL(tmpbuf->str);
6097 while (p < tmpbuf->out) {
6098 unsigned char c = *p++;
6099 enc_size += (c > 0x7F || c == '=' || mime_char_needs_qencode[c]) ? 3 : 1;
6100 }
6101 return enc_size;
6102 }
6103}
6104
6105static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
6106{
6107 unsigned char *out, *limit;
6108 MB_CONVERT_BUF_LOAD(outbuf, out, limit);
6109 unsigned char *p = (unsigned char*)ZSTR_VAL(tmpbuf->str), *e = tmpbuf->out;
6110
6111 if (base64) {
6112 MB_CONVERT_BUF_ENSURE(outbuf, out, limit, ((e - p) + 2) / 3 * 4);
6113 while ((e - p) >= 3) {
6114 unsigned char a = *p++;
6115 unsigned char b = *p++;
6116 unsigned char c = *p++;
6117 uint32_t bits = (a << 16) | (b << 8) | c;
6118 out = mb_convert_buf_add4(out,
6119 base64_table[(bits >> 18) & 0x3F],
6120 base64_table[(bits >> 12) & 0x3F],
6121 base64_table[(bits >> 6) & 0x3F],
6122 base64_table[bits & 0x3F]);
6123 }
6124 if (p != e) {
6125 if ((e - p) == 1) {
6126 uint32_t bits = *p++;
6127 out = mb_convert_buf_add4(out, base64_table[(bits >> 2) & 0x3F], base64_table[(bits & 0x3) << 4], '=', '=');
6128 } else {
6129 unsigned char a = *p++;
6130 unsigned char b = *p++;
6131 uint32_t bits = (a << 8) | b;
6132 out = mb_convert_buf_add4(out, base64_table[(bits >> 10) & 0x3F], base64_table[(bits >> 4) & 0x3F], base64_table[(bits & 0xF) << 2], '=');
6133 }
6134 }
6135 } else {
6136 MB_CONVERT_BUF_ENSURE(outbuf, out, limit, (e - p) * 3);
6137 while (p < e) {
6138 unsigned char c = *p++;
6139 if (c > 0x7F || c == '=' || mime_char_needs_qencode[c]) {
6140 out = mb_convert_buf_add3(out, '=', "0123456789ABCDEF"[(c >> 4) & 0xF], "0123456789ABCDEF"[c & 0xF]);
6141 } else {
6142 out = mb_convert_buf_add(out, c);
6143 }
6144 }
6145 }
6146
6147 mb_convert_buf_reset(tmpbuf, 0);
6148 MB_CONVERT_BUF_STORE(outbuf, out, limit);
6149}
6150
6151#define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE 90
6152
6153static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
6154{
6155 unsigned char *in = (unsigned char*)ZSTR_VAL(input);
6156 size_t in_len = ZSTR_LEN(input);
6157
6158 ZEND_ASSERT(outcode->mime_name != NULL);
6159 ZEND_ASSERT(outcode->mime_name[0] != '\0');
6160
6161 if (!in_len) {
6162 return zend_empty_string;
6163 }
6164
6165 if (indent < 0 || indent >= 74) {
6166 indent = 0;
6167 }
6168
6169 if (linefeed_len > 8) {
6170 linefeed_len = 8;
6171 }
6172 /* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
6173 for (size_t i = 0; i < linefeed_len; i++) {
6174 if (linefeed[i] == '\0') {
6175 linefeed_len = i;
6176 break;
6177 }
6178 }
6179
6180 unsigned int state = 0;
6181 /* wchar_buf should be big enough that when it is full, we definitely have enough
6182 * wchars to fill an entire line of output */
6183 uint32_t wchar_buf[MBSTRING_HEADER_ENC_WCHAR_BUFSIZE];
6184 uint32_t *p, *e;
6185 /* What part of wchar_buf is filled with still-unprocessed data which should not
6186 * be overwritten? */
6187 unsigned int offset = 0;
6188 size_t line_start = 0;
6189
6190 /* If the entire input string is ASCII with no spaces (except possibly leading
6191 * spaces), just pass it through unchanged */
6192 bool checking_leading_spaces = true;
6193 while (in_len) {
6194 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE, &state);
6195 p = wchar_buf;
6196 e = wchar_buf + out_len;
6197
6198 while (p < e) {
6199 uint32_t w = *p++;
6200 if (checking_leading_spaces) {
6201 if (w == ' ') {
6202 continue;
6203 } else {
6204 checking_leading_spaces = false;
6205 }
6206 }
6207 if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
6208 /* We cannot simply pass input string through unchanged; start again */
6209 in = (unsigned char*)ZSTR_VAL(input);
6210 in_len = ZSTR_LEN(input);
6211 goto no_passthrough;
6212 }
6213 }
6214 }
6215
6216 return zend_string_copy(input); /* This just increments refcount */
6217
6218no_passthrough: ;
6219
6221 mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6222
6223 /* Encode some prefix of the input string as plain ASCII if possible
6224 * If we find it necessary to switch to Base64/QPrint encoding, we will
6225 * do so all the way to the end of the string */
6226 while (in_len) {
6227 /* Decode part of the input string, refill wchar_buf */
6229 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6231 p = wchar_buf;
6232 e = wchar_buf + offset + out_len;
6233 /* ASCII output is broken into space-delimited 'words'
6234 * If we find a non-ASCII character in the middle of a word, we will
6235 * transfer-encode the entire word */
6236 uint32_t *word_start = p;
6237
6238 /* Don't consider adding line feed for spaces at the beginning of a word */
6239 while (p < e && *p == ' ' && (p - word_start) <= 74) {
6240 p++;
6241 }
6242
6243 while (p < e) {
6244 uint32_t w = *p++;
6245
6246 if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
6247 /* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
6248 * If we are already too far along on a line to include Base64/QPrint encoded data
6249 * on the same line (without overrunning max line length), then add a line feed
6250 * right now */
6251feed_and_mime_encode:
6252 if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
6253 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6254 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6255 buf.out = mb_convert_buf_add(buf.out, ' ');
6256 indent = 0;
6257 line_start = mb_convert_buf_len(&buf);
6258 } else if (mb_convert_buf_len(&buf) > 0) {
6259 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
6260 buf.out = mb_convert_buf_add(buf.out, ' ');
6261 }
6262 p = word_start; /* Back up to where MIME encoding of input chars should start */
6263 goto mime_encoding_needed;
6264 } else if (w == ' ') {
6265 /* When we see a space, check whether we should insert a line break */
6266 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
6267 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6268 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6269 buf.out = mb_convert_buf_add(buf.out, ' ');
6270 indent = 0;
6271 line_start = mb_convert_buf_len(&buf);
6272 } else if (mb_convert_buf_len(&buf) > 0) {
6273 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6274 buf.out = mb_convert_buf_add(buf.out, ' ');
6275 }
6276 /* Output one (space-delimited) word as plain ASCII */
6277 while (word_start < p-1) {
6278 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6279 }
6280 word_start++;
6281 while (p < e && *p == ' ') {
6282 p++;
6283 }
6284 }
6285 }
6286
6287 if (in_len) {
6288 /* Copy chars which are part of an incomplete 'word' to the beginning
6289 * of wchar_buf and reprocess them on the next iteration.
6290 * But first make sure that the incomplete 'word' isn't so big that
6291 * there will be no space to add any more decoded wchars in the buffer
6292 * (which could lead to an infinite loop) */
6293 if ((word_start - wchar_buf) < MBSTRING_MIN_WCHAR_BUFSIZE) {
6294 goto feed_and_mime_encode;
6295 }
6296 offset = e - word_start;
6297 if (offset) {
6298 memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
6299 }
6300 } else {
6301 /* We have reached the end of the input string while still in 'ASCII mode';
6302 * process any trailing ASCII chars which were not followed by a space */
6303 if (word_start < e && mb_convert_buf_len(&buf) > 0) {
6304 /* The whole input string was not just one big ASCII 'word' with no spaces
6305 * consider adding a line feed if necessary to prevent output lines from
6306 * being too long */
6307 if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
6308 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
6309 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6310 buf.out = mb_convert_buf_add(buf.out, ' ');
6311 } else {
6312 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
6313 buf.out = mb_convert_buf_add(buf.out, ' ');
6314 }
6315 }
6316 while (word_start < e) {
6317 buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
6318 }
6319 }
6320 }
6321
6322 /* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
6323 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6324
6325mime_encoding_needed: ;
6326
6327 /* We will generate the output line by line, first converting wchars to bytes
6328 * in the requested output encoding, then transfer-encoding those bytes as
6329 * Base64 or QPrint
6330 * 'tmpbuf' will receive the bytes which need to be transfer-encoded before
6331 * sending them to 'buf' */
6332 mb_convert_buf tmpbuf;
6333 mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6334
6335 /* Do we need to refill wchar_buf to make sure we don't run out of wchars
6336 * in the middle of a line? */
6337 offset = e - p;
6339 goto start_new_line;
6340 }
6341 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6342
6343 while(true) {
6344refill_wchar_buf: ;
6346 size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, MBSTRING_HEADER_ENC_WCHAR_BUFSIZE - offset, &state);
6348 p = wchar_buf;
6349 e = wchar_buf + offset + out_len;
6350
6351start_new_line: ;
6352 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
6353 buf.out = mb_convert_buf_add2(buf.out, '=', '?');
6354 buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
6355 buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');
6356
6357 /* How many wchars should we try converting to Base64/QPrint-encoded bytes?
6358 * We do something like a 'binary search' to find the greatest number which
6359 * can be included on this line without exceeding max line length */
6360 unsigned int n = 12;
6361 size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);
6362
6363 while (true) {
6364 ZEND_ASSERT(p < e);
6365
6366 /* Remember where we were in process of generating output, so we can back
6367 * up if necessary */
6368 size_t tmppos = mb_convert_buf_len(&tmpbuf);
6369 unsigned int tmpstate = tmpbuf.state;
6370
6371 /* Try encoding 'n' wchars in output text encoding and sending output
6372 * bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
6373 * current line. */
6374 n = MIN(n, e - p);
6375 outcode->from_wchar(p, n, &tmpbuf, false);
6376
6377 /* For some output text encodings, there may be a few ending bytes
6378 * which need to be emitted to output before we break a line.
6379 * Again, remember where we were so we can back up */
6380 size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
6381 unsigned int tmpstate2 = tmpbuf.state;
6382 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6383
6384 if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
6385 /* If we convert 'n' more wchars on the current line, it will not
6386 * overflow the maximum line length */
6387 p += n;
6388
6389 if (p == e) {
6390 /* We are done; we shouldn't reach here if there is more remaining
6391 * of the input string which needs to be processed */
6393 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6394 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
6395 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6396 mb_convert_buf_free(&tmpbuf);
6397 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6398 } else {
6399 /* It's possible that more chars might fit on the current line,
6400 * so back up to where we were before emitting any ending bytes */
6401 mb_convert_buf_reset(&tmpbuf, tmppos2);
6402 tmpbuf.state = tmpstate2;
6403 }
6404 } else {
6405 /* Converting 'n' more wchars on this line would be too much.
6406 * Back up to where we were before we tried that. */
6407 mb_convert_buf_reset(&tmpbuf, tmppos);
6408 tmpbuf.state = tmpstate;
6409
6410 if (n == 1) {
6411 /* We have found the exact number of chars which will fit on the
6412 * current line. Finish up and move to a new line. */
6413 outcode->from_wchar(NULL, 0, &tmpbuf, true);
6414 transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
6415 tmpbuf.state = 0;
6416
6417 MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
6418 buf.out = mb_convert_buf_add2(buf.out, '?', '=');
6419
6420 indent = 0; /* Indent argument must only affect the first line */
6421
6422 if (in_len || p < e) {
6423 /* We still have more input to process */
6424 buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
6425 buf.out = mb_convert_buf_add(buf.out, ' ');
6426 line_start = mb_convert_buf_len(&buf);
6427 offset = e - p;
6429 /* Copy any remaining wchars to beginning of buffer and refill
6430 * the rest of the buffer */
6431 memmove(wchar_buf, p, offset * sizeof(uint32_t));
6432 goto refill_wchar_buf;
6433 }
6434 goto start_new_line;
6435 } else {
6436 /* We are done! */
6437 mb_convert_buf_free(&tmpbuf);
6438 return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
6439 }
6440 } else {
6441 /* Try a smaller number of wchars */
6442 n = MAX(n >> 1, 1);
6443 }
6444 }
6445 }
6446 }
6447}
6448
6450{
6452 zend_string *str, *charset_name = NULL, *transenc_name = NULL;
6453 char *linefeed = "\r\n";
6454 size_t linefeed_len = 2;
6455 zend_long indent = 0;
6456 bool base64 = true;
6457
6459 Z_PARAM_STR(str)
6461 Z_PARAM_STR(charset_name)
6462 Z_PARAM_STR(transenc_name)
6463 Z_PARAM_STRING(linefeed, linefeed_len)
6464 Z_PARAM_LONG(indent)
6466
6467 if (charset_name != NULL) {
6468 charset = php_mb_get_encoding(charset_name, 2);
6469 if (!charset) {
6470 RETURN_THROWS();
6471 } else if (charset->mime_name == NULL || charset->mime_name[0] == '\0' || charset == &mbfl_encoding_qprint) {
6472 zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
6473 RETURN_THROWS();
6474 }
6475 } else {
6477 if (lang != NULL) {
6479 const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
6480 char t = transenc->name[0];
6481 if (t == 'Q' || t == 'q') {
6482 base64 = false;
6483 }
6484 }
6485 }
6486
6487 if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
6488 char t = ZSTR_VAL(transenc_name)[0];
6489 if (t == 'Q' || t == 'q') {
6490 base64 = false;
6491 }
6492 }
6493
6494 RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
6495}
6496
/* Translate a single character of the Base64 alphabet into its 6-bit value.
 *
 * Returns 0-25 for 'A'-'Z', 26-51 for 'a'-'z', 52-61 for '0'-'9', 62 for '+'
 * and 63 for '/'. Any other byte (including the '=' padding character) is not
 * part of the alphabet and yields -1. */
static int8_t decode_base64(unsigned char c)
{
	int8_t value = -1;

	if (c == '+') {
		value = 62;
	} else if (c == '/') {
		value = 63;
	} else if (c >= '0' && c <= '9') {
		value = (int8_t)(52 + (c - '0'));
	} else if (c >= 'a' && c <= 'z') {
		value = (int8_t)(26 + (c - 'a'));
	} else if (c >= 'A' && c <= 'Z') {
		value = (int8_t)(c - 'A');
	}

	return value;
}
6512
/* Hexadecimal digit lookup used when decoding Quoted-Printable "=XX" escapes.
 * The table has one entry per possible byte value: '0'-'9' map to 0-9,
 * 'A'-'F' and 'a'-'f' map to 10-15, and every other byte maps to -1.
 * It is a read-only lookup table, hence `const`. */
static const int8_t qprint_map[] = {
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
6531
6532/* Decode MIME encoded word as defined in RFC 2047 */
6533static unsigned char* mime_header_decode_encoded_word(unsigned char *p, unsigned char *e, const mbfl_encoding *outcode, mb_convert_buf *outbuf, unsigned int *state)
6534{
6535 if ((e - p) < 6) {
6536 return NULL;
6537 }
6538
6539 ZEND_ASSERT(p[0] == '=');
6540 ZEND_ASSERT(p[1] == '?');
6541 p += 2;
6542
6543 unsigned char *charset = p;
6544 unsigned char *charset_end = memchr(charset, '?', e - charset);
6545 if (charset_end == NULL) {
6546 return NULL;
6547 }
6548
6549 unsigned char *encoding = charset_end + 1;
6550 p = encoding + 1;
6551 if (p >= e || *p++ != '?') {
6552 return NULL;
6553 }
6554
6555 char *charset_name = estrndup((const char*)charset, charset_end - charset);
6556 const mbfl_encoding *incode = mbfl_name2encoding(charset_name);
6557 efree(charset_name);
6558 if (incode == NULL) {
6559 return NULL;
6560 }
6561
6562 unsigned char *end_marker = (unsigned char*)zend_memnstr((const char*)p, "?=", 2, (const char*)e);
6563 if (end_marker) {
6564 e = end_marker;
6565 } else if (p < e && *(e-1) == '?') {
6566 /* If encoded word is not properly terminated, but last byte is '?',
6567 * take that as a terminator (legacy behavior) */
6568 e--;
6569 }
6570
6571 unsigned char *buf = emalloc(e - p), *bufp = buf;
6572 if (*encoding == 'Q' || *encoding == 'q') {
6573 /* Fill `buf` with bytes from decoding QPrint */
6574 while (p < e) {
6575 unsigned char c = *p++;
6576 if (c == '_') {
6577 *bufp++ = ' ';
6578 continue;
6579 } else if (c == '=' && (e - p) >= 2) {
6580 unsigned char c2 = *p++;
6581 unsigned char c3 = *p++;
6582 if (qprint_map[c2] >= 0 && qprint_map[c3] >= 0) {
6583 *bufp++ = (qprint_map[c2] << 4) | (qprint_map[c3] & 0xF);
6584 continue;
6585 } else if (c2 == '\r') {
6586 if (c3 != '\n') {
6587 p--;
6588 }
6589 continue;
6590 } else if (c2 == '\n') {
6591 p--;
6592 continue;
6593 }
6594 }
6595 *bufp++ = c;
6596 }
6597 } else if (*encoding == 'B' || *encoding == 'b') {
6598 /* Fill `buf` with bytes from decoding Base64 */
6599 unsigned int bits = 0, cache = 0;
6600 while (p < e) {
6601 unsigned char c = *p++;
6602 if (c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '=') {
6603 continue;
6604 }
6605 int8_t decoded = decode_base64(c);
6606 if (decoded == -1) {
6607 *bufp++ = '?';
6608 continue;
6609 }
6610 bits += 6;
6611 cache = (cache << 6) | (decoded & 0x3F);
6612 if (bits == 24) {
6613 *bufp++ = (cache >> 16) & 0xFF;
6614 *bufp++ = (cache >> 8) & 0xFF;
6615 *bufp++ = cache & 0xFF;
6616 bits = cache = 0;
6617 }
6618 }
6619 if (bits == 18) {
6620 *bufp++ = (cache >> 10) & 0xFF;
6621 *bufp++ = (cache >> 2) & 0xFF;
6622 } else if (bits == 12) {
6623 *bufp++ = (cache >> 4) & 0xFF;
6624 }
6625 } else {
6626 efree(buf);
6627 return NULL;
6628 }
6629
6630 size_t in_len = bufp - buf;
6631 uint32_t wchar_buf[128];
6632
6633 bufp = buf;
6634 while (in_len) {
6635 size_t out_len = incode->to_wchar(&bufp, &in_len, wchar_buf, 128, state);
6636 ZEND_ASSERT(out_len <= 128);
6637 outcode->from_wchar(wchar_buf, out_len, outbuf, false);
6638 }
6639
6640 efree(buf);
6641 return e + 2;
6642}
6643
6644static zend_string* mb_mime_header_decode(zend_string *input, const mbfl_encoding *outcode)
6645{
6646 unsigned char *p = (unsigned char*)ZSTR_VAL(input), *e = p + ZSTR_LEN(input);
6647 unsigned int state = 0;
6648 bool space_pending = false;
6649
6651 mb_convert_buf_init(&buf, ZSTR_LEN(input), '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
6652
6653 while (p < e) {
6654 unsigned char c = *p;
6655
6656 if (c == '=' && *(p + 1) == '?' && (e - p) >= 6) {
6657 /* Does this look like a MIME encoded word? If so, try to decode it as one */
6658 unsigned char *incode_end = memchr(p + 2, '?', e - p - 2);
6659 if (incode_end && (e - incode_end) >= 3) {
6660 unsigned char *temp = mime_header_decode_encoded_word(p, e, outcode, &buf, &state);
6661 if (temp) {
6662 p = temp;
6663 /* Decoding of MIME encoded word was successful;
6664 * Try to collapse a run of whitespace */
6665 if (p < e && (*p == '\n' || *p == '\r')) {
6666 do {
6667 p++;
6668 } while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
6669 /* We will only actually output a space if this is not immediately followed
6670 * by another valid encoded word */
6671 space_pending = true;
6672 }
6673 continue;
6674 }
6675 }
6676 }
6677
6678 if (space_pending) {
6679 uint32_t space = ' ';
6680 outcode->from_wchar(&space, 1, &buf, false);
6681 space_pending = false;
6682 }
6683
6684 /* Consume a run of plain ASCII characters */
6685 if (c != '\n' && c != '\r') {
6686 unsigned char *end = p + 1;
6687 while (end < e && (*end != '=' && *end != '\n' && *end != '\r')) {
6688 end++;
6689 }
6690 uint32_t wchar_buf[128];
6691 size_t in_len = end - p;
6692 while (in_len) {
6693 size_t out_len = mbfl_encoding_ascii.to_wchar(&p, &in_len, wchar_buf, 128, &state);
6694 ZEND_ASSERT(out_len <= 128);
6695 outcode->from_wchar(wchar_buf, out_len, &buf, false);
6696 }
6697 }
6698 /* Collapse a run of whitespace into a single space */
6699 if (p < e && (*p == '\n' || *p == '\r')) {
6700 do {
6701 p++;
6702 } while (p < e && (*p == '\n' || *p == '\r' || *p == '\t' || *p == ' '));
6703 if (p < e) {
6704 /* Emulating legacy behavior of mb_decode_mimeheader here;
6705 * a run of whitespace is not converted to a space at the very
6706 * end of the input string */
6707 uint32_t space = ' ';
6708 outcode->from_wchar(&space, 1, &buf, false);
6709 }
6710 }
6711 }
6712
6713 outcode->from_wchar(NULL, 0, &buf, true);
6714
6715 return mb_convert_buf_result(&buf, outcode);
6716}
6717
6719{
6720 zend_string *str;
6721
6723 Z_PARAM_STR(str)
6725
6726 RETURN_STR(mb_mime_header_decode(str, MBSTRG(current_internal_encoding)));
6727}
SAPI_API int sapi_register_treat_data(void(*treat_data)(int arg, char *str, zval *destArray))
Definition SAPI.c:982
SAPI_API void sapi_unregister_post_entry(const sapi_post_entry *post_entry)
Definition SAPI.c:962
SAPI_API int sapi_register_post_entries(const sapi_post_entry *post_entries)
Definition SAPI.c:933
#define SAPI_DEFAULT_MIMETYPE
Definition SAPI.h:307
#define sapi_add_header(a, b, c)
Definition SAPI.h:204
#define SG(v)
Definition SAPI.h:160
struct _sapi_post_entry sapi_post_entry
Definition SAPI.h:59
size_t len
Definition apprentice.c:174
bool exception
Definition assert.c:30
count(Countable|array $value, int $mode=COUNT_NORMAL)
strchr(string $haystack, string $needle, bool $before_needle=false)
char s[4]
Definition cdf.c:77
uint32_t v
Definition cdf.c:1237
#define FIRST_DOUBLEWIDTH_CODEPOINT
Definition eaw_table.h:17
int begin
Definition eaw_table.h:20
PHPAPI zend_string * php_escape_shell_cmd(const zend_string *unescaped_cmd)
Definition exec.c:280
error($message)
Definition ext_skel.php:22
zend_ffi_type * type
Definition ffi.c:3812
zval * zv
Definition ffi.c:3975
zend_long n
Definition ffi.c:4979
new_type size
Definition ffi.c:4365
zend_string * res
Definition ffi.c:4692
memcpy(ptr1, ptr2, size)
zval * val
Definition ffi.c:4262
HashTable * ht
Definition ffi.c:4838
buf start
Definition ffi.c:4687
ffi persistent
Definition ffi.c:3633
zend_ffi_ctype_name_buf buf
Definition ffi.c:4685
#define __attribute__(a)
Definition file.h:131
zend_long offset
char * mode
size_t filename_len
#define SIZE_MAX
Definition funcs.c:51
#define NULL
Definition gdcache.h:45
#define SUCCESS
Definition hash_sha3.c:261
again j
foreach($dp as $el) foreach( $dp as $el) if( $pass2< 2) echo ""
enum entity_charset charset
Definition html_tables.h:39
PHPAPI ZEND_COLD void php_info_print_table_header(int num_cols,...)
Definition info.c:1133
PHPAPI bool php_mail(const char *to, const char *subject, const char *message, const char *headers, const char *extra_cmd)
Definition mail.c:439
PHPAPI zend_string * php_mail_build_headers(HashTable *headers)
Definition mail.c:207
PHPAPI void(* php_internal_encoding_changed)(void)
Definition main.c:584
PHPAPI const char * php_get_output_encoding(void)
Definition main.c:575
PHPAPI const char * php_get_input_encoding(void)
Definition main.c:566
PHPAPI ZEND_COLD void php_error_docref(const char *docref, int type, const char *format,...)
Definition main.c:1173
PHPAPI const char * php_get_internal_encoding(void)
Definition main.c:557
const mbfl_encoding * _php_mb_encoding_handler_ex(const php_mb_encoding_handler_info_t *info, zval *array_ptr, char *res)
Definition mb_gpc.c:165
struct _php_mb_encoding_handler_info_t php_mb_encoding_handler_info_t
mbfl_string * mbfl_strcut(mbfl_string *string, mbfl_string *result, size_t from, size_t length)
Definition mbfilter.c:100
#define MBFL_ERROR_OFFSET
Definition mbfilter.h:136
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG
Definition mbfilter.h:117
#define MBFL_ERROR_ENCODING
Definition mbfilter.h:134
#define MBFL_SUBSTR_UNTIL_END
Definition mbfilter.h:141
#define MBFL_VERSION_MAJOR
Definition mbfilter.h:111
#define MBFL_VERSION_TEENY
Definition mbfilter.h:113
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8
Definition mbfilter.h:119
#define MBFL_ERROR_NOT_FOUND
Definition mbfilter.h:133
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE
Definition mbfilter.h:115
#define MBFL_VERSION_MINOR
Definition mbfilter.h:112
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY
Definition mbfilter.h:118
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR
Definition mbfilter.h:116
const mbfl_encoding mbfl_encoding_8bit
const mbfl_encoding mbfl_encoding_base64
const mbfl_encoding mbfl_encoding_sjis_mac
const mbfl_encoding mbfl_encoding_html_ent
const mbfl_encoding mbfl_encoding_pass
const mbfl_encoding mbfl_encoding_qprint
const mbfl_encoding mbfl_encoding_ascii
const mbfl_encoding mbfl_encoding_ucs4be
const mbfl_encoding mbfl_encoding_utf16be
const mbfl_encoding mbfl_encoding_utf16le
const mbfl_encoding mbfl_encoding_utf8
const mbfl_encoding mbfl_encoding_uuencode
#define MBFL_ENCTYPE_SBCS
Definition mbfl_consts.h:34
#define MBFL_ENCTYPE_GL_UNSAFE
Definition mbfl_consts.h:37
#define MBFL_ENCTYPE_WCS4
Definition mbfl_consts.h:36
#define MBFL_ENCTYPE_WCS2
Definition mbfl_consts.h:35
#define MBFL_BAD_INPUT
Definition mbfl_consts.h:45
zend_string * mb_fast_convert(unsigned char *in, size_t in_len, const mbfl_encoding *from, const mbfl_encoding *to, uint32_t replacement_char, unsigned int error_mode, unsigned int *num_errors)
const mbfl_encoding ** mbfl_get_supported_encodings(void)
const mbfl_encoding * mbfl_name2encoding_ex(const char *name, size_t name_len)
const mbfl_encoding * mbfl_name2encoding(const char *name)
const mbfl_encoding * mbfl_no2encoding(enum mbfl_no_encoding no_encoding)
const char * mbfl_encoding_preferred_mime_name(const mbfl_encoding *encoding)
const char * mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding)
mbfl_no_encoding
@ mbfl_no_encoding_8859_9
@ mbfl_no_encoding_cp936
@ mbfl_no_encoding_7bit
@ mbfl_no_encoding_cp50220
@ mbfl_no_encoding_utf8
@ mbfl_no_encoding_charset_min
@ mbfl_no_encoding_cp50222
@ mbfl_no_encoding_invalid
@ mbfl_no_encoding_euc_kr
@ mbfl_no_encoding_euc_cn
@ mbfl_no_encoding_utf7imap
@ mbfl_no_encoding_jis
@ mbfl_no_encoding_euc_tw
@ mbfl_no_encoding_2022jpms
@ mbfl_no_encoding_uhc
@ mbfl_no_encoding_euc_jp
@ mbfl_no_encoding_base64
@ mbfl_no_encoding_koi8u
@ mbfl_no_encoding_cp866
@ mbfl_no_encoding_armscii8
@ mbfl_no_encoding_ascii
@ mbfl_no_encoding_cp1251
@ mbfl_no_encoding_qprint
@ mbfl_no_encoding_sjis
@ mbfl_no_encoding_8bit
@ mbfl_no_encoding_utf7
@ mbfl_no_encoding_big5
@ mbfl_no_encoding_utf8_sb
@ mbfl_no_encoding_koi8r
@ mbfl_no_encoding_cp1254
#define MB_CONVERT_BUF_STORE(buf, _out, _limit)
#define MB_CONVERT_BUF_ENSURE(buf, out, limit, needed)
#define MBSTRING_MIN_WCHAR_BUFSIZE
#define MB_CONVERT_BUF_LOAD(buf, _out, _limit)
const char * mbfl_no_language2name(enum mbfl_no_language no_language)
const mbfl_language * mbfl_no2language(enum mbfl_no_language no_language)
enum mbfl_no_language mbfl_name2no_language(const char *name)
struct _mbfl_language mbfl_language
mbfl_no_language
@ mbfl_no_language_korean
@ mbfl_no_language_turkish
@ mbfl_no_language_russian
@ mbfl_no_language_armenian
@ mbfl_no_language_ukrainian
@ mbfl_no_language_japanese
@ mbfl_no_language_invalid
@ mbfl_no_language_neutral
@ mbfl_no_language_uni
@ mbfl_no_language_traditional_chinese
@ mbfl_no_language_simplified_chinese
struct _mbfl_string mbfl_string
MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
Definition mbstring.c:5994
MBSTRING_API char * php_mb_safe_strrchr(const char *s, unsigned int c, size_t nbytes, const mbfl_encoding *enc)
Definition mbstring.c:6010
#define PHP_MBSTR_MAIL_MIME_HEADER4
#define MB_STRSTR
Definition mbstring.c:2151
struct _php_mb_nls_ident_list php_mb_nls_ident_list
mb_trim_mode
Definition mbstring.c:3016
@ MB_RTRIM
Definition mbstring.c:3018
@ MB_BOTH_TRIM
Definition mbstring.c:3019
@ MB_LTRIM
Definition mbstring.c:3017
#define MB_STRISTR
Definition mbstring.c:2153
#define DEC_ENTITY_MINLEN
Definition mbstring.c:4083
MBSTRING_API const mbfl_encoding * mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
Definition mbstring.c:3360
#define mb_fast_check_utf8
Definition mbstring.c:5261
#define HEX_ENTITY_MAXLEN
Definition mbstring.c:4086
char mb_convert_kana_flags[17]
Definition mbstring.c:3594
MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc)
Definition mbstring.c:6049
MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
Definition mbstring.c:4872
#define CRLF
Definition mbstring.c:4296
#define DEC_ENTITY_MAXLEN
Definition mbstring.c:4085
MBSTRING_API zend_string * php_mb_convert_encoding_ex(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding *from_encoding)
Definition mbstring.c:2710
MBSTRING_API zend_string * php_mb_convert_encoding(const char *input, size_t length, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
Definition mbstring.c:2718
#define HEX_ENTITY_MINLEN
Definition mbstring.c:4084
#define MBSTRING_HEADER_ENC_WCHAR_BUFSIZE
Definition mbstring.c:6151
#define PHP_MBSTR_MAIL_MIME_HEADER2
zend_module_entry mbstring_module_entry
Definition mbstring.c:198
MBSTRING_API HashTable * php_mb_convert_encoding_recursive(HashTable *input, const mbfl_encoding *to_encoding, const mbfl_encoding **from_encodings, size_t num_from_encodings)
Definition mbstring.c:2738
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode)
#define MB_STRRCHR
Definition mbstring.c:2152
#define MB_STRRICHR
Definition mbstring.c:2154
#define PHP_MBSTR_MAIL_MIME_HEADER3
#define PHP_MBSTR_MAIL_MIME_HEADER1
#define MBSTRING_API
Definition mbstring.h:39
int filter_illegal_mode
Definition mbstring.h:92
size_t http_input_list_size
Definition mbstring.h:84
size_t current_detect_order_list_size
Definition mbstring.h:88
zend_string * last_used_encoding_name
Definition mbstring.h:107
uint32_t filter_illegal_substchar
Definition mbstring.h:93
enum mbfl_no_language language
Definition mbstring.h:96
const mbfl_encoding ** current_detect_order_list
Definition mbstring.h:87
const mbfl_encoding ** http_input_list
Definition mbstring.h:83
const mbfl_encoding * internal_encoding
Definition mbstring.h:74
const mbfl_encoding ** detect_order_list
Definition mbstring.h:85
size_t detect_order_list_size
Definition mbstring.h:86
uint32_t current_filter_illegal_substchar
Definition mbstring.h:95
const mbfl_encoding * http_output_encoding
Definition mbstring.h:76
const mbfl_encoding * http_input_identify_string
Definition mbstring.h:82
#define PHP_MBSTRING_VERSION
Definition mbstring.h:23
int current_filter_illegal_mode
Definition mbstring.h:94
const mbfl_encoding * http_input_identify_post
Definition mbstring.h:80
HashTable * all_encodings_list
Definition mbstring.h:91
bool outconv_enabled
Definition mbstring.h:100
void * http_output_conv_mimetypes
Definition mbstring.h:102
char * internal_encoding_name
Definition mbstring.h:73
#define MBSTRG(v)
Definition mbstring.h:118
const mbfl_encoding * current_internal_encoding
Definition mbstring.h:75
const mbfl_encoding * http_input_identify
Definition mbstring.h:78
size_t illegalchars
Definition mbstring.h:99
enum mbfl_no_encoding * default_detect_order_list
Definition mbstring.h:89
bool http_input_set
Definition mbstring.h:112
bool http_output_set
Definition mbstring.h:111
bool internal_encoding_set
Definition mbstring.h:110
const mbfl_encoding * last_used_encoding
Definition mbstring.h:108
size_t default_detect_order_list_size
Definition mbstring.h:90
const mbfl_encoding * current_http_output_encoding
Definition mbstring.h:77
bool strict_detection
Definition mbstring.h:98
bool encoding_translation
Definition mbstring.h:97
const mbfl_encoding * http_input_identify_get
Definition mbstring.h:79
const mbfl_encoding * http_input_identify_cookie
Definition mbstring.h:81
unsigned int outconv_state
Definition mbstring.h:101
mb_get_info(string $type="all")
mb_send_mail(string $to, string $subject, string $message, array|string $additional_headers=[], ?string $additional_params=null)
mb_convert_variables(string $to_encoding, array|string $from_encoding, mixed &$var, mixed &... $vars)
mb_strtoupper(string $string, ?string $encoding=null)
mb_trim(string $string, ?string $characters=null, ?string $encoding=null)
mb_strcut(string $string, int $start, ?int $length=null, ?string $encoding=null)
mb_strimwidth(string $string, int $start, int $width, string $trim_marker="", ?string $encoding=null)
mb_output_handler(string $string, int $status)
mb_encode_numericentity(string $string, array $map, ?string $encoding=null, bool $hex=false)
mb_decode_mimeheader(string $string)
mb_internal_encoding(?string $encoding=null)
mb_lcfirst(string $string, ?string $encoding=null)
mb_convert_case(string $string, int $mode, ?string $encoding=null)
mb_scrub(string $string, ?string $encoding=null)
mb_strrpos(string $haystack, string $needle, int $offset=0, ?string $encoding=null)
mb_ucfirst(string $string, ?string $encoding=null)
mb_language(?string $language=null)
mb_strstr(string $haystack, string $needle, bool $before_needle=false, ?string $encoding=null)
mb_preferred_mime_name(string $encoding)
mb_detect_encoding(string $string, array|string|null $encodings=null, bool $strict=false)
mb_decode_numericentity(string $string, array $map, ?string $encoding=null)
mb_strripos(string $haystack, string $needle, int $offset=0, ?string $encoding=null)
mb_http_input(?string $type=null)
mb_check_encoding(array|string|null $value=null, ?string $encoding=null)
mb_substr_count(string $haystack, string $needle, ?string $encoding=null)
mb_encoding_aliases(string $encoding)
mb_chr(int $codepoint, ?string $encoding=null)
mb_str_pad(string $string, int $length, string $pad_string=" ", int $pad_type=STR_PAD_RIGHT, ?string $encoding=null)
mb_http_output(?string $encoding=null)
mb_stripos(string $haystack, string $needle, int $offset=0, ?string $encoding=null)
mb_detect_order(array|string|null $encoding=null)
mb_substitute_character(string|int|null $substitute_character=null)
mb_encode_mimeheader(string $string, ?string $charset=null, ?string $transfer_encoding=null, string $newline="\r\n", int $indent=0)
mb_strwidth(string $string, ?string $encoding=null)
mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding=null)
mb_substr(string $string, int $start, ?int $length=null, ?string $encoding=null)
mb_ltrim(string $string, ?string $characters=null, ?string $encoding=null)
mb_list_encodings()
mb_convert_kana(string $string, string $mode="KV", ?string $encoding=null)
mb_strtolower(string $string, ?string $encoding=null)
mb_parse_str(string $string, &$result)
mb_ord(string $string, ?string $encoding=null)
mb_rtrim(string $string, ?string $characters=null, ?string $encoding=null)
mb_strpos(string $haystack, string $needle, int $offset=0, ?string $encoding=null)
mb_stristr(string $haystack, string $needle, bool $before_needle=false, ?string $encoding=null)
mb_strrchr(string $haystack, string $needle, bool $before_needle=false, ?string $encoding=null)
mb_strlen(string $string, ?string $encoding=null)
mb_strrichr(string $haystack, string $needle, bool $before_needle=false, ?string $encoding=null)
mb_str_split(string $string, int $length=1, ?string $encoding=null)
lu_byte right
Definition minilua.c:4267
lu_byte left
Definition minilua.c:4266
#define next(ls)
Definition minilua.c:2661
#define PCRE2_ZERO_TERMINATED
Definition pcre2.h:481
#define pcre2_code_free
Definition pcre2.h:852
#define PCRE2_UCHAR
Definition pcre2.h:819
#define pcre2_get_error_message
Definition pcre2.h:866
#define pcre2_code
Definition pcre2.h:822
#define pcre2_compile
Definition pcre2.h:853
#define pcre2_match
Definition pcre2.h:881
#define PCRE2_SIZE
Definition pcre2.h:479
#define PCRE2_SPTR
Definition pcre2.h:820
#define PCRE2_CASELESS
Definition pcre2.h:122
#define pcre2_match_data
Definition pcre2.h:844
#define memmove(a, b, c)
php_info_print_table_start()
Definition info.c:1064
php_info_print_table_row(2, "PDO Driver for Firebird", "enabled")
php_info_print_table_end()
Definition info.c:1074
#define PHP_GINIT
Definition php.h:397
#define PHP_FUNCTION
Definition php.h:364
#define PHP_MSHUTDOWN_FUNCTION
Definition php.h:401
#define PHP_MINFO
Definition php.h:396
#define PHP_MINIT_FUNCTION
Definition php.h:400
#define PHP_RINIT
Definition php.h:394
#define PHP_MSHUTDOWN
Definition php.h:393
#define PHP_MINFO_FUNCTION
Definition php.h:404
#define PHP_GINIT_FUNCTION
Definition php.h:405
#define PHP_RSHUTDOWN
Definition php.h:395
#define PHP_RINIT_FUNCTION
Definition php.h:402
#define PHP_RSHUTDOWN_FUNCTION
Definition php.h:403
#define PHP_GSHUTDOWN_FUNCTION
Definition php.h:406
#define PHP_MINIT
Definition php.h:392
#define PHP_MODULE_GLOBALS
Definition php.h:408
#define PHP_GSHUTDOWN
Definition php.h:398
#define DEFAULT_POST_CONTENT_TYPE
int line
Definition php_ffi.h:54
unsigned const char * end
Definition php_ffi.h:51
unsigned const char * pos
Definition php_ffi.h:52
#define PG(v)
Definition php_globals.h:31
#define PHP_INI_PERDIR
Definition php_ini.h:42
#define PHP_INI_STAGE_RUNTIME
Definition php_ini.h:75
#define PHP_INI_ALL
Definition php_ini.h:45
#define PHP_INI_USER
Definition php_ini.h:41
#define PHP_INI_BEGIN
Definition php_ini.h:52
#define STD_PHP_INI_ENTRY
Definition php_ini.h:64
#define PHP_INI_ENTRY
Definition php_ini.h:62
#define STD_PHP_INI_BOOLEAN
Definition php_ini.h:66
#define PHP_INI_MH
Definition php_ini.h:49
#define PHP_INI_SYSTEM
Definition php_ini.h:43
#define PHP_INI_END
Definition php_ini.h:53
#define PHP_OUTPUT_HANDLER_END
Definition php_output.h:29
#define PHP_OUTPUT_HANDLER_START
Definition php_output.h:24
struct php_pcntl_pending_signal * head
Definition php_pcntl.h:47
PHPAPI pcre2_match_context * php_pcre_mctx(void)
Definition php_pcre.c:3054
PHPAPI pcre2_compile_context * php_pcre_cctx(void)
Definition php_pcre.c:3064
PHPAPI pcre2_match_data * php_pcre_create_match_data(uint32_t capture_count, pcre2_code *re)
Definition php_pcre.c:906
PHPAPI void php_pcre_free_match_data(pcre2_match_data *match_data)
Definition php_pcre.c:928
PHPAPI char * php_strtok_r(char *s, const char *delim, char **last)
Definition reentrancy.c:263
unsigned char key[REFLECTION_KEY_LEN]
xmlCharEncodingHandlerPtr encoding
Definition php_soap.h:170
#define PHP_STR_PAD_RIGHT
Definition php_string.h:71
#define PHP_STR_PAD_BOTH
Definition php_string.h:72
#define PHP_STR_PAD_LEFT
Definition php_string.h:70
PHPAPI zend_string * php_trim(zend_string *str, const char *what, size_t what_len, int mode)
Definition string.c:628
MBSTRING_API zend_string * php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)
php_case_mode
Definition php_unicode.h:80
@ PHP_UNICODE_CASE_TITLE
Definition php_unicode.h:83
@ PHP_UNICODE_CASE_FOLD_SIMPLE
Definition php_unicode.h:88
@ PHP_UNICODE_CASE_LOWER
Definition php_unicode.h:82
@ PHP_UNICODE_CASE_UPPER
Definition php_unicode.h:81
@ PHP_UNICODE_CASE_MODE_MAX
Definition php_unicode.h:89
#define PARSE_STRING
SAPI_API void php_rfc1867_set_multibyte_callbacks(php_rfc1867_encoding_translation_t encoding_translation, php_rfc1867_get_detect_order_t get_detect_order, php_rfc1867_set_input_encoding_t set_input_encoding, php_rfc1867_getword_t getword, php_rfc1867_getword_conf_t getword_conf, php_rfc1867_basename_t basename)
Definition rfc1867.c:1289
#define MULTIPART_CONTENT_TYPE
Definition rfc1867.h:22
const char * endptr
Definition session.c:1021
p
Definition session.c:1105
#define spprintf
Definition spprintf.h:29
enum mbfl_no_encoding mail_header_encoding
enum mbfl_no_encoding mail_charset
enum mbfl_no_encoding mail_body_encoding
const mbfl_encoding ** from_encodings
Definition mb_gpc.h:26
const mbfl_encoding * to_encoding
Definition mb_gpc.h:25
enum mbfl_no_language lang
Definition mbstring.c:109
enum mbfl_no_encoding * list
Definition mbstring.c:110
Definition file.h:177
size_t in_len
Definition mbstring.c:3235
const unsigned char * in
Definition mbstring.c:3234
unsigned int state
Definition mbstring.c:3237
float multiplier
Definition mbstring.c:3238
const mbfl_encoding * enc
Definition mbstring.c:3233
uint64_t demerits
Definition mbstring.c:3236
unsigned char * out
zend_string * str
mb_to_wchar_fn to_wchar
const unsigned char * mblen_table
mb_check_fn check
const char * name
unsigned int flag
const char * mime_name
enum mbfl_no_encoding no_encoding
mb_from_wchar_fn from_wchar
$obj a
Definition test.php:84
#define MBFL_ZEN2HAN_HIRAGANA
#define MBFL_HAN2ZEN_ALPHA
#define MBFL_HAN2ZEN_GLUE
#define MBFL_ZENKAKU_KATA2HIRA
#define MBFL_ZEN2HAN_ALPHA
#define MBFL_HAN2ZEN_ALL
#define MBFL_ZEN2HAN_ALL
#define MBFL_HAN2ZEN_NUMERIC
#define MBFL_HAN2ZEN_KATAKANA
#define MBFL_ZEN2HAN_NUMERIC
#define MBFL_HAN2ZEN_HIRAGANA
#define MBFL_ZENKAKU_HIRA2KATA
#define MBFL_ZEN2HAN_KATAKANA
ZEND_API ZEND_COLD void zend_throw_error(zend_class_entry *exception_ce, const char *format,...)
Definition zend.c:1772
ZEND_API ZEND_COLD void zend_value_error(const char *format,...)
Definition zend.c:1849
#define ZEND_TSRMLS_CACHE_UPDATE()
Definition zend.h:69
#define INTERNAL_FUNCTION_PARAMETERS
Definition zend.h:49
#define ZEND_TSRMLS_CACHE_DEFINE()
Definition zend.h:68
#define INTERNAL_FUNCTION_PARAM_PASSTHRU
Definition zend.h:50
ZEND_API zend_result add_next_index_stringl(zval *arg, const char *str, size_t length)
Definition zend_API.c:2195
ZEND_API ZEND_COLD void zend_argument_must_not_be_empty_error(uint32_t arg_num)
Definition zend_API.c:443
ZEND_API ZEND_COLD void zend_argument_value_error(uint32_t arg_num, const char *format,...)
Definition zend_API.c:433
ZEND_API zend_result add_next_index_string(zval *arg, const char *str)
Definition zend_API.c:2186
ZEND_API zend_result add_next_index_str(zval *arg, zend_string *str)
Definition zend_API.c:2177
#define Z_PARAM_PATH_STR(dest)
Definition zend_API.h:2041
#define ZEND_NUM_ARGS()
Definition zend_API.h:530
#define Z_PARAM_ARRAY_HT_OR_STR_OR_NULL(dest_ht, dest_str)
Definition zend_API.h:2154
#define Z_PARAM_PATH_OR_NULL(dest, dest_len)
Definition zend_API.h:2029
#define RETURN_STRING(s)
Definition zend_API.h:1043
#define ZEND_PARSE_PARAMETERS_END()
Definition zend_API.h:1641
#define RETURN_FALSE
Definition zend_API.h:1058
#define RETVAL_STRING(s)
Definition zend_API.h:1017
#define Z_PARAM_STR_OR_NULL(dest)
Definition zend_API.h:2089
#define ZEND_PARSE_PARAMETERS_NONE()
Definition zend_API.h:1623
#define RETURN_NULL()
Definition zend_API.h:1036
#define ZVAL_STRING(z, s)
Definition zend_API.h:956
#define ZEND_DECLARE_MODULE_GLOBALS(module_name)
Definition zend_API.h:268
#define array_init_size(arg, size)
Definition zend_API.h:538
#define RETURN_ARR(r)
Definition zend_API.h:1050
#define Z_PARAM_OPTIONAL
Definition zend_API.h:1667
#define ZEND_GET_MODULE(name)
Definition zend_API.h:241
#define Z_PARAM_STRING(dest, dest_len)
Definition zend_API.h:2071
#define RETVAL_ARR(r)
Definition zend_API.h:1024
#define Z_PARAM_STR(dest)
Definition zend_API.h:2086
#define Z_PARAM_STRING_OR_NULL(dest, dest_len)
Definition zend_API.h:2074
#define Z_PARAM_STR_OR_LONG_OR_NULL(dest_str, dest_long, is_null)
Definition zend_API.h:2168
#define ZEND_PARSE_PARAMETERS_START(min_num_args, max_num_args)
Definition zend_API.h:1620
#define Z_PARAM_ARRAY_HT_OR_STR(dest_ht, dest_str)
Definition zend_API.h:2151
#define Z_PARAM_LONG(dest)
Definition zend_API.h:1896
#define Z_PARAM_VARIADIC(spec, dest, dest_num)
Definition zend_API.h:2124
#define RETURN_LONG(l)
Definition zend_API.h:1037
#define RETURN_BOOL(b)
Definition zend_API.h:1035
#define RETURN_NEW_STR(s)
Definition zend_API.h:1041
#define RETURN_THROWS()
Definition zend_API.h:1060
#define HASH_OF(p)
Definition zend_API.h:1062
#define RETVAL_TRUE
Definition zend_API.h:1033
#define Z_PARAM_ARRAY_HT(dest)
Definition zend_API.h:1852
#define RETURN_STR(s)
Definition zend_API.h:1039
#define RETVAL_BOOL(b)
Definition zend_API.h:1009
#define Z_PARAM_LONG_OR_NULL(dest, is_null)
Definition zend_API.h:1899
#define RETVAL_LONG(l)
Definition zend_API.h:1011
#define Z_PARAM_BOOL(dest)
Definition zend_API.h:1726
#define RETURN_EMPTY_ARRAY()
Definition zend_API.h:1051
#define RETURN_EMPTY_STRING()
Definition zend_API.h:1047
#define Z_PARAM_PATH(dest, dest_len)
Definition zend_API.h:2026
#define Z_PARAM_ZVAL(dest)
Definition zend_API.h:2100
#define RETVAL_STR(s)
Definition zend_API.h:1013
#define RETVAL_FALSE
Definition zend_API.h:1032
#define RETURN_TRUE
Definition zend_API.h:1059
#define RETVAL_STRINGL(s, l)
Definition zend_API.h:1018
#define Z_PARAM_PATH_STR_OR_NULL(dest)
Definition zend_API.h:2044
#define RETURN_STR_COPY(s)
Definition zend_API.h:1042
#define array_init(arg)
Definition zend_API.h:537
#define estrndup(s, length)
Definition zend_alloc.h:165
#define ecalloc(nmemb, size)
Definition zend_alloc.h:158
#define efree(ptr)
Definition zend_alloc.h:155
#define estrdup(s)
Definition zend_alloc.h:164
#define pefree(ptr, persistent)
Definition zend_alloc.h:191
#define safe_emalloc(nmemb, size, offset)
Definition zend_alloc.h:154
#define pecalloc(nmemb, size, persistent)
Definition zend_alloc.h:200
#define emalloc(size)
Definition zend_alloc.h:151
struct _zval_struct zval
strlen(string $string)
strncmp(string $string1, string $string2, int $length)
zend_string_release_ex(func->internal_function.function_name, 0)
zval * args
struct _zend_property_info zend_property_info
#define strncasecmp(s1, s2, n)
#define strcasecmp(s1, s2)
#define ZEND_API
#define snprintf
#define ZEND_NO_SANITIZE_ADDRESS
#define E_WARNING
Definition zend_errors.h:24
#define E_DEPRECATED
Definition zend_errors.h:37
ZEND_API void(ZEND_FASTCALL *zend_touch_vm_stack_data)(void *vm_stack_data)
#define ZEND_REF_DEL_TYPE_SOURCE(ref, source)
#define EG(v)
ZEND_API void ZEND_FASTCALL zend_hash_destroy(HashTable *ht)
Definition zend_hash.c:1727
ZEND_API zval *ZEND_FASTCALL zend_hash_str_find(const HashTable *ht, const char *str, size_t len)
Definition zend_hash.c:2689
ZEND_API zval *ZEND_FASTCALL zend_hash_index_add(HashTable *ht, zend_ulong h, zval *pData)
Definition zend_hash.c:1209
ZEND_API zval *ZEND_FASTCALL zend_hash_next_index_insert(HashTable *ht, zval *pData)
Definition zend_hash.c:1224
ZEND_API zval *ZEND_FASTCALL zend_hash_index_add_new(HashTable *ht, zend_ulong h, zval *pData)
Definition zend_hash.c:1214
ZEND_API void ZEND_FASTCALL zend_array_destroy(HashTable *ht)
Definition zend_hash.c:1808
ZEND_API zval *ZEND_FASTCALL zend_hash_update(HashTable *ht, zend_string *key, zval *pData)
Definition zend_hash.c:997
ZEND_API zval *ZEND_FASTCALL zend_hash_add(HashTable *ht, zend_string *key, zval *pData)
Definition zend_hash.c:992
#define zend_hash_init(ht, nSize, pHashFunction, pDestructor, persistent)
Definition zend_hash.h:108
#define zend_new_array(size)
Definition zend_hash.h:338
#define ZEND_HASH_FOREACH_KEY_VAL(ht, _h, _key, _val)
Definition zend_hash.h:1181
#define ZEND_HASH_FOREACH_VAL_IND(ht, _val)
Definition zend_hash.h:1110
#define ZEND_HASH_FOREACH_END()
Definition zend_hash.h:1086
#define ZVAL_EMPTY_ARRAY(z)
Definition zend_hash.h:87
#define ZEND_HASH_FOREACH_VAL(ht, _val)
Definition zend_hash.h:1102
ZEND_API zend_string * zend_ini_str_ex(const char *name, size_t name_length, bool orig, bool *exists)
Definition zend_ini.c:521
ZEND_API zend_result zend_alter_ini_entry(zend_string *name, zend_string *new_value, int modify_type, int stage)
Definition zend_ini.c:325
ZEND_API zend_string * zend_ini_str(const char *name, size_t name_length, bool orig)
Definition zend_ini.c:545
#define UNREGISTER_INI_ENTRIES()
Definition zend_ini.h:204
#define REGISTER_INI_ENTRIES()
Definition zend_ini.h:203
#define DISPLAY_INI_ENTRIES()
Definition zend_ini.h:205
int32_t zend_long
Definition zend_long.h:42
#define ZEND_LONG_MIN
Definition zend_long.h:46
#define ZEND_LONG_FMT
Definition zend_long.h:87
#define ZEND_LONG_MAX
Definition zend_long.h:45
struct _zend_string zend_string
#define INIT_FUNC_ARGS_PASSTHRU
#define ZEND_MOD_END
#define ZEND_MODULE_INFO_FUNC_ARGS_PASSTHRU
struct _zend_module_dep zend_module_dep
struct _zend_module_entry zend_module_entry
#define ZEND_MOD_REQUIRED(name)
#define STANDARD_MODULE_PROPERTIES_EX
#define STANDARD_MODULE_HEADER_EX
ZEND_API void zend_multibyte_restore_functions(void)
ZEND_API zend_result zend_multibyte_set_internal_encoding(const zend_encoding *encoding)
ZEND_API zend_result zend_multibyte_set_functions(const zend_multibyte_functions *functions)
struct _zend_encoding zend_encoding
struct _zend_multibyte_functions zend_multibyte_functions
ZEND_API zend_long ZEND_FASTCALL zval_try_get_long(const zval *op, bool *failed)
ZEND_API void ZEND_FASTCALL zend_str_tolower(char *str, size_t length)
int last
char * alloca()
#define ZEND_INTRIN_AVX2_FUNC_DECL(func)
#define ZEND_ATTRIBUTE_UNUSED
#define MIN(a, b)
#define ZEND_FALLTHROUGH
#define ZEND_ASSERT(c)
#define ZEND_UNREACHABLE()
#define ZEND_VOIDP(ptr)
#define EMPTY_SWITCH_DEFAULT_CASE()
#define UNEXPECTED(condition)
#define MAX(a, b)
ZEND_API zend_string * zend_string_concat2(const char *str1, size_t str1_len, const char *str2, size_t str2_len)
ZEND_API zend_string * zend_empty_string
Definition zend_string.c:51
#define ZSTR_IS_VALID_UTF8(s)
Definition zend_string.h:85
#define ZSTR_IS_INTERNED(s)
Definition zend_string.h:84
#define ZSTR_VAL(zstr)
Definition zend_string.h:68
#define ZSTR_INIT_LITERAL(s, persistent)
#define ZSTR_MAX_LEN
#define zend_string_equals_ci(s1, s2)
#define ZSTR_LEN(zstr)
Definition zend_string.h:69
#define zend_string_equals_literal_ci(str, c)
#define ZSTR_CHAR(c)
#define Z_TYPE_P(zval_p)
Definition zend_types.h:660
#define IS_TRUE
Definition zend_types.h:603
#define ZVAL_STR(z, s)
#define Z_ISREF_P(zval_p)
Definition zend_types.h:954
#define Z_REFVAL_P(zval_p)
#define IS_FALSE
Definition zend_types.h:602
#define Z_STRVAL_P(zval_p)
Definition zend_types.h:975
#define Z_ARRVAL_P(zval_p)
Definition zend_types.h:987
#define ZVAL_TRUE(z)
#define ZVAL_DEREF(z)
#define IS_STRING
Definition zend_types.h:606
#define Z_REFCOUNTED_P(zval_p)
Definition zend_types.h:921
struct _zend_array HashTable
Definition zend_types.h:386
#define Z_OBJ_P(zval_p)
Definition zend_types.h:990
#define IS_ARRAY
Definition zend_types.h:607
#define IS_DOUBLE
Definition zend_types.h:605
#define Z_STR_P(zval_p)
Definition zend_types.h:972
#define IS_STR_VALID_UTF8
Definition zend_types.h:820
#define GC_DELREF(p)
Definition zend_types.h:710
#define GC_ADDREF(p)
Definition zend_types.h:709
#define Z_UNPROTECT_RECURSION_P(zv)
Definition zend_types.h:889
#define Z_STRLEN_P(zval_p)
Definition zend_types.h:978
#define GC_TRY_UNPROTECT_RECURSION(p)
Definition zend_types.h:880
#define IS_NULL
Definition zend_types.h:601
@ FAILURE
Definition zend_types.h:61
#define IS_OBJECT
Definition zend_types.h:608
#define GC_TRY_PROTECT_RECURSION(p)
Definition zend_types.h:876
#define IS_LONG
Definition zend_types.h:604
#define ZVAL_ARR(z, a)
#define IS_REFERENCE
Definition zend_types.h:610
#define ZVAL_COPY(z, v)
#define GC_UNPROTECT_RECURSION(p)
Definition zend_types.h:872
#define Z_INDIRECT_P(zval_p)
#define Z_REF_P(zval_p)
#define Z_PROTECT_RECURSION_P(zv)
Definition zend_types.h:888
ZEND_RESULT_CODE zend_result
Definition zend_types.h:64
#define SEPARATE_ARRAY(zv)
#define GC_IS_RECURSIVE(p)
Definition zend_types.h:865
#define IS_INDIRECT
Definition zend_types.h:623
#define GC_ADD_FLAGS(p, flags)
Definition zend_types.h:759
#define Z_IS_RECURSIVE_P(zv)
Definition zend_types.h:887
ZEND_API void zval_ptr_dtor(zval *zval_ptr)
#define ZVAL_PTR_DTOR
zval retval
zval * return_value
uint32_t arg_num
zend_string * name
bool result
zval * ret
value
out($f, $s)