php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
html_document.c
Go to the documentation of this file.
1/*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Authors: Niels Dossche <nielsdos@php.net> |
14 +----------------------------------------------------------------------+
15*/
16
17#ifdef HAVE_CONFIG_H
18#include <config.h>
19#endif
20
21#include "php.h"
22#if defined(HAVE_LIBXML) && defined(HAVE_DOM)
23#include "php_dom.h"
24#include "infra.h"
25#include "html5_parser.h"
26#include "html5_serializer.h"
27#include "namespace_compat.h"
28#include "private_data.h"
29#include "dom_properties.h"
33
34/* Implementation defined, but as HTML5 defaults in all other cases to UTF-8, we'll do the same. */
35#define DOM_FALLBACK_ENCODING_ID LXB_ENCODING_UTF_8
36
/* Remembers the position reached by the previous line/column lookup so that the
 * next lookup can continue scanning from there instead of starting over. */
37typedef struct dom_line_column_cache {
/* Line number at last_offset; reset to 1 by dom_reset_line_column_cache(). */
38 size_t last_line;
/* Column number at last_offset; reset to 1 by dom_reset_line_column_cache(). */
39 size_t last_column;
/* Offset into the current input window up to which the input has been scanned. */
40 size_t last_offset;
41} dom_line_column_cache;
42
/* State shared with the error reporters while a document is being parsed. */
43typedef struct dom_lexbor_libxml2_bridge_application_data {
/* Name of the input, printed in the error messages. */
44 const char *input_name;
/* Decoded code points of the current input window, or NULL when the raw characters are used instead. */
45 const lxb_codepoint_t *current_input_codepoints;
/* Raw bytes of the current input window, or NULL when the decoded code points are used instead. */
46 const char *current_input_characters;
/* Length of the current input window; offsets are clamped to it during line/column lookups. */
47 size_t current_input_length;
/* Sum of the lengths of all input windows processed so far. */
48 size_t current_total_offset;
/* Line/column cache used for tokenizer error positions. */
49 dom_line_column_cache cache_tokenizer;
/* When set, the tree error reporter suppresses the initial-mode "unexpected token" error on line 1. */
50 bool html_no_implied;
51} dom_lexbor_libxml2_bridge_application_data;
52
/* Result of sniffing the character encoding of an input buffer. */
53typedef struct dom_character_encoding_data {
/* Encoding that was determined for the input. */
54 const lxb_encoding_data_t *encoding_data;
/* Number of leading byte-order-mark bytes the caller has to skip (0 when no BOM was found). */
55 size_t bom_shift;
56} dom_character_encoding_data;
57
/* Output callback type: takes an opaque pointer, a character buffer and a length, and returns a zend_result.
 * NOTE(review): presumably the opaque pointer is dom_output_ctx::output_data; the call sites are outside this view. */
58typedef zend_result (*dom_write_output)(void*, const char *, size_t);
59
/* Bundle of encoder/decoder state, scratch buffers and an output callback.
 * NOTE(review): the users of this structure are outside this view; presumably it drives serialization. */
60typedef struct dom_output_ctx {
61 const lxb_encoding_data_t *encoding_data;
62 const lxb_encoding_data_t *decoding_data;
63 lxb_encoding_encode_t *encode;
64 lxb_encoding_decode_t *decode;
/* Scratch buffer for decoded code points. */
65 lxb_codepoint_t *codepoints;
/* Scratch buffer for encoded output bytes. */
66 lxb_char_t *encoding_output;
/* Opaque pointer kept next to write_output. */
67 void *output_data;
/* Callback of type dom_write_output. */
68 dom_write_output write_output;
69} dom_output_ctx;
70
/* Decoder and encoder state plus the fixed-size scratch buffers used to convert input to UTF-8 while parsing. */
71typedef struct dom_decoding_encoding_ctx {
72 /* We can skip some conversion if the input and output encoding are both UTF-8,
73 * we only have to validate and substitute replacement characters */
74 bool fast_path; /* Put first, near the encode & decode structures, for cache locality */
75 lxb_encoding_encode_t encode;
76 lxb_encoding_decode_t decode;
/* Output encoding; initialised to UTF-8 by dom_decoding_encoding_ctx_init(). */
77 const lxb_encoding_data_t *encode_data;
/* Input encoding; initialised to UTF-8 and replaced by dom_setup_parser_encoding_manually(). */
78 const lxb_encoding_data_t *decode_data;
/* Buffer that receives the encoded bytes. */
79 lxb_char_t encoding_output[4096];
/* Buffer that receives the decoded code points. */
80 lxb_codepoint_t codepoints[4096];
81} dom_decoding_encoding_ctx;
82
83/* https://dom.spec.whatwg.org/#dom-document-implementation */
/* Property reader: returns the implementation object that is cached in property slot 0 of the
 * document object, creating it on first access. Always returns SUCCESS.
 * NOTE(review): the signature (listing line 84) and the lookup that produces `prop_info`
 * (listing line 90) are not visible in this view. */
85{
86 const uint32_t PROP_INDEX = 0;
87
88#if ZEND_DEBUG
/* Debug builds verify that the "implementation" property really lives in slot PROP_INDEX. */
89 zend_string *implementation_str = ZSTR_INIT_LITERAL("implementation", false);
91 zend_string_release_ex(implementation_str, false);
92 ZEND_ASSERT(OBJ_PROP_TO_NUM(prop_info->offset) == PROP_INDEX);
93#endif
94
95 zval *cached_implementation = OBJ_PROP_NUM(&obj->std, PROP_INDEX);
/* An undefined slot means the implementation object has not been created yet. */
96 if (Z_ISUNDEF_P(cached_implementation)) {
97 php_dom_create_implementation(cached_implementation, true);
98 }
99
/* Hand out a copy of the cached object reference. */
100 ZVAL_OBJ_COPY(retval, Z_OBJ_P(cached_implementation));
101
102 return SUCCESS;
103}
104
/* Puts a decoding/encoding context into its default state: UTF-8 for both input and output,
 * fast path enabled, and the context's own arrays used as working buffers.
 * NOTE(review): the names of the calls that receive the two argument lists below
 * (listing lines 109 and 115-116) and listing line 122 are not visible in this view. */
105static void dom_decoding_encoding_ctx_init(dom_decoding_encoding_ctx *ctx)
106{
107 ctx->decode_data = ctx->encode_data = lxb_encoding_data(LXB_ENCODING_UTF_8);
108 ctx->fast_path = true;
/* Encoder arguments: state, encoding, output buffer and its capacity in elements. */
110 &ctx->encode,
111 ctx->encode_data,
112 ctx->encoding_output,
113 sizeof(ctx->encoding_output) / sizeof(*ctx->encoding_output)
114 );
/* Decoder arguments: state, encoding, code point buffer and its capacity in elements. */
117 &ctx->decode,
118 ctx->decode_data,
119 ctx->codepoints,
120 sizeof(ctx->codepoints) / sizeof(*ctx->codepoints)
121 );
123}
124
125static const char *dom_lexbor_tokenizer_error_code_to_string(lxb_html_tokenizer_error_id_t id)
126{
127 switch (id) {
128 case LXB_HTML_TOKENIZER_ERROR_ABCLOFEMCO: return "abrupt-closing-of-empty-comment";
129 case LXB_HTML_TOKENIZER_ERROR_ABDOPUID: return "abrupt-doctype-public-identifier";
130 case LXB_HTML_TOKENIZER_ERROR_ABDOSYID: return "abrupt-doctype-system-identifier";
131 case LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE: return "absence-of-digits-in-numeric-character-reference";
132 case LXB_HTML_TOKENIZER_ERROR_CDINHTCO: return "cdata-in-html-content";
133 case LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA: return "character-reference-outside-unicode-range";
134 case LXB_HTML_TOKENIZER_ERROR_COCHININST: return "control-character-in-input-stream";
135 case LXB_HTML_TOKENIZER_ERROR_COCHRE: return "control-character-reference";
136 case LXB_HTML_TOKENIZER_ERROR_ENTAWIAT: return "end-tag-with-attributes";
137 case LXB_HTML_TOKENIZER_ERROR_DUAT: return "duplicate-attribute";
138 case LXB_HTML_TOKENIZER_ERROR_ENTAWITRSO: return "end-tag-with-trailing-solidus";
139 case LXB_HTML_TOKENIZER_ERROR_EOBETANA: return "eof-before-tag-name";
140 case LXB_HTML_TOKENIZER_ERROR_EOINCD: return "eof-in-cdata";
141 case LXB_HTML_TOKENIZER_ERROR_EOINCO: return "eof-in-comment";
142 case LXB_HTML_TOKENIZER_ERROR_EOINDO: return "eof-in-doctype";
143 case LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE: return "eof-in-script-html-comment-like-text";
144 case LXB_HTML_TOKENIZER_ERROR_EOINTA: return "eof-in-tag";
145 case LXB_HTML_TOKENIZER_ERROR_INCLCO: return "incorrectly-closed-comment";
146 case LXB_HTML_TOKENIZER_ERROR_INOPCO: return "incorrectly-opened-comment";
147 case LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA: return "invalid-character-sequence-after-doctype-name";
148 case LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA: return "invalid-first-character-of-tag-name";
149 case LXB_HTML_TOKENIZER_ERROR_MIATVA: return "missing-attribute-value";
150 case LXB_HTML_TOKENIZER_ERROR_MIDONA: return "missing-doctype-name";
151 case LXB_HTML_TOKENIZER_ERROR_MIDOPUID: return "missing-doctype-public-identifier";
152 case LXB_HTML_TOKENIZER_ERROR_MIDOSYID: return "missing-doctype-system-identifier";
153 case LXB_HTML_TOKENIZER_ERROR_MIENTANA: return "missing-end-tag-name";
154 case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID: return "missing-quote-before-doctype-public-identifier";
155 case LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID: return "missing-quote-before-doctype-system-identifier";
156 case LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE: return "missing-semicolon-after-character-reference";
157 case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE: return "missing-whitespace-after-doctype-public-keyword";
158 case LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE: return "missing-whitespace-after-doctype-system-keyword";
159 case LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA: return "missing-whitespace-before-doctype-name";
160 case LXB_HTML_TOKENIZER_ERROR_MIWHBEAT: return "missing-whitespace-between-attributes";
161 case LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID: return "missing-whitespace-between-doctype-public-and-system-identifiers";
162 case LXB_HTML_TOKENIZER_ERROR_NECO: return "nested-comment";
163 case LXB_HTML_TOKENIZER_ERROR_NOCHRE: return "noncharacter-character-reference";
164 case LXB_HTML_TOKENIZER_ERROR_NOININST: return "noncharacter-in-input-stream";
165 case LXB_HTML_TOKENIZER_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus";
166 case LXB_HTML_TOKENIZER_ERROR_NUCHRE: return "null-character-reference";
167 case LXB_HTML_TOKENIZER_ERROR_SUCHRE: return "surrogate-character-reference";
168 case LXB_HTML_TOKENIZER_ERROR_SUININST: return "surrogate-in-input-stream";
169 case LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID: return "unexpected-character-after-doctype-system-identifier";
170 case LXB_HTML_TOKENIZER_ERROR_UNCHINATNA: return "unexpected-character-in-attribute-name";
171 case LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA: return "unexpected-character-in-unquoted-attribute-value";
172 case LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA: return "unexpected-equals-sign-before-attribute-name";
173 case LXB_HTML_TOKENIZER_ERROR_UNNUCH: return "unexpected-null-character";
174 case LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA: return "unexpected-question-mark-instead-of-tag-name";
175 case LXB_HTML_TOKENIZER_ERROR_UNSOINTA: return "unexpected-solidus-in-tag";
176 case LXB_HTML_TOKENIZER_ERROR_UNNACHRE: return "unknown-named-character-reference";
177 default: return "unknown error";
178 }
179}
180
181static const char *dom_lexbor_tree_error_code_to_string(lxb_html_tree_error_id_t id)
182{
183 switch (id) {
184 case LXB_HTML_RULES_ERROR_UNTO: return "unexpected-token";
185 case LXB_HTML_RULES_ERROR_UNCLTO: return "unexpected-closed-token";
186 case LXB_HTML_RULES_ERROR_NUCH: return "null-character";
187 case LXB_HTML_RULES_ERROR_UNCHTO: return "unexpected-character-token";
188 case LXB_HTML_RULES_ERROR_UNTOININMO: return "unexpected-token-in-initial-mode";
189 case LXB_HTML_RULES_ERROR_BADOTOININMO: return "bad-doctype-token-in-initial-mode";
190 case LXB_HTML_RULES_ERROR_DOTOINBEHTMO: return "doctype-token-in-before-html-mode";
191 case LXB_HTML_RULES_ERROR_UNCLTOINBEHTMO: return "unexpected-closed-token-in-before-html-mode";
192 case LXB_HTML_RULES_ERROR_DOTOINBEHEMO: return "doctype-token-in-before-head-mode";
193 case LXB_HTML_RULES_ERROR_UNCLTOINBEHEMO: return "unexpected-closed_token-in-before-head-mode";
194 case LXB_HTML_RULES_ERROR_DOTOINHEMO: return "doctype-token-in-head-mode";
195 case LXB_HTML_RULES_ERROR_NOVOHTELSTTAWITRSO: return "non-void-html-element-start-tag-with-trailing-solidus";
196 case LXB_HTML_RULES_ERROR_HETOINHEMO: return "head-token-in-head-mode";
197 case LXB_HTML_RULES_ERROR_UNCLTOINHEMO: return "unexpected-closed-token-in-head-mode";
198 case LXB_HTML_RULES_ERROR_TECLTOWIOPINHEMO: return "template-closed-token-without-opening-in-head-mode";
199 case LXB_HTML_RULES_ERROR_TEELISNOCUINHEMO: return "template-element-is-not-current-in-head-mode";
200 case LXB_HTML_RULES_ERROR_DOTOINHENOMO: return "doctype-token-in-head-noscript-mode";
201 case LXB_HTML_RULES_ERROR_DOTOAFHEMO: return "doctype-token-after-head-mode";
202 case LXB_HTML_RULES_ERROR_HETOAFHEMO: return "head-token-after-head-mode";
203 case LXB_HTML_RULES_ERROR_DOTOINBOMO: return "doctype-token-in-body-mode";
204 case LXB_HTML_RULES_ERROR_BAENOPELISWR: return "bad-ending-open-elements-is-wrong";
205 case LXB_HTML_RULES_ERROR_OPELISWR: return "open-elements-is-wrong";
206 case LXB_HTML_RULES_ERROR_UNELINOPELST: return "unexpected-element-in-open-elements-stack";
207 case LXB_HTML_RULES_ERROR_MIELINOPELST: return "missing-element-in-open-elements-stack";
208 case LXB_HTML_RULES_ERROR_NOBOELINSC: return "no-body-element-in-scope";
209 case LXB_HTML_RULES_ERROR_MIELINSC: return "missing-element-in-scope";
210 case LXB_HTML_RULES_ERROR_UNELINSC: return "unexpected-element-in-scope";
211 case LXB_HTML_RULES_ERROR_UNELINACFOST: return "unexpected-element-in-active-formatting-stack";
212 case LXB_HTML_RULES_ERROR_UNENOFFI: return "unexpected-end-of-file";
213 case LXB_HTML_RULES_ERROR_CHINTATE: return "characters-in-table-text";
214 case LXB_HTML_RULES_ERROR_DOTOINTAMO: return "doctype-token-in-table-mode";
215 case LXB_HTML_RULES_ERROR_DOTOINSEMO: return "doctype-token-in-select-mode";
216 case LXB_HTML_RULES_ERROR_DOTOAFBOMO: return "doctype-token-after-body-mode";
217 case LXB_HTML_RULES_ERROR_DOTOINFRMO: return "doctype-token-in-frameset-mode";
218 case LXB_HTML_RULES_ERROR_DOTOAFFRMO: return "doctype-token-after-frameset-mode";
219 case LXB_HTML_RULES_ERROR_DOTOFOCOMO: return "doctype-token-foreign-content-mode";
220 default: return "unknown error";
221 }
222}
223
224static const char *dom_lexbor_libxml2_bridge_status_code_to_string(lexbor_libxml2_bridge_status status)
225{
226 switch (status) {
227 case LEXBOR_LIBXML2_BRIDGE_STATUS_CANNOT_INIT: return "cannot initialize data structures";
228 case LEXBOR_LIBXML2_BRIDGE_STATUS_FATAL_PARSE: return "fatal error in parsing";
229 case LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW: return "string length overflow";
230 case LEXBOR_LIBXML2_BRIDGE_STATUS_OOM: return "out of memory";
231 default: return "unknown error";
232 }
233}
234
235static void dom_reset_line_column_cache(dom_line_column_cache *cache)
236{
237 cache->last_line = 1;
238 cache->last_column = 1;
239 cache->last_offset = 0;
240}
241
242static void dom_find_line_and_column_using_cache(
243 const dom_lexbor_libxml2_bridge_application_data *application_data,
244 dom_line_column_cache *cache,
245 size_t offset
246)
247{
248 offset -= application_data->current_total_offset;
249 if (offset > application_data->current_input_length) {
250 /* Possible with empty input, also just good for general safety */
251 offset = application_data->current_input_length;
252 }
253
254 size_t last_column = cache->last_column;
255 size_t last_line = cache->last_line;
256 size_t last_offset = cache->last_offset;
257
258 /* Either unicode or UTF-8 data */
259 if (application_data->current_input_codepoints != NULL) {
260 while (last_offset < offset) {
261 if (application_data->current_input_codepoints[last_offset] == 0x000A /* Unicode codepoint for line feed */) {
262 last_line++;
263 last_column = 1;
264 } else {
265 last_column++;
266 }
267 last_offset++;
268 }
269 } else {
270 while (last_offset < offset) {
271 const lxb_char_t current = application_data->current_input_characters[last_offset];
272 if (current == '\n') {
273 last_line++;
274 last_column = 1;
275 last_offset++;
276 } else {
277 /* See Lexbor tokenizer patch
278 * Note for future self: branchlessly computing the length and jumping by the length would be nice,
279 * however it takes so many instructions to do so that it is slower than this naive method. */
280 if ((current & 0b11000000) != 0b10000000) {
281 last_column++;
282 }
283 last_offset++;
284 }
285 }
286 }
287
288 cache->last_column = last_column;
289 cache->last_line = last_line;
290 cache->last_offset = last_offset;
291}
292
/* Reports one tokenizer error: resolves `offset` to a line and column through the tokenizer
 * cache and emits a "tokenizer error ..." message that carries the input name.
 * NOTE(review): the declaration of the `error` parameter (listing line 295) is not visible in this view. */
293static void dom_lexbor_libxml2_bridge_tokenizer_error_reporter(
294 void *application_data_voidptr,
296 size_t offset
297)
298{
299 dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;
/* Updates cache_tokenizer.last_line / last_column, which are read right below. */
300 dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, offset);
301 php_libxml_pretend_ctx_error_ex(application_data->input_name, application_data->cache_tokenizer.last_line, application_data->cache_tokenizer.last_column, "tokenizer error %s in %s, line: %zu, column: %zu\n", dom_lexbor_tokenizer_error_code_to_string(error->id), application_data->input_name, application_data->cache_tokenizer.last_line, application_data->cache_tokenizer.last_column);
302}
303
/* Reports one tree-construction error at the given line and column. `len` is the length of
 * the offending span: spans longer than one are printed as a column range, others as a single column.
 * NOTE(review): the declaration of the `error` parameter (listing line 306) is not visible in this view. */
304static void dom_lexbor_libxml2_bridge_tree_error_reporter(
305 void *application_data_voidptr,
307 size_t line,
308 size_t column,
309 size_t len
310)
311{
312 dom_lexbor_libxml2_bridge_application_data *application_data = application_data_voidptr;
313
314 if (line == 1 && application_data->html_no_implied && error->id == LXB_HTML_RULES_ERROR_UNTOININMO) {
315 /* For no implied mode, we want to mimick libxml's behaviour of not reporting an error for a lacking doctype. */
316 return;
317 }
318
319 if (len <= 1) {
320 /* Possible with EOF, or single-character tokens, don't use a range in the error display in this case */
321 php_libxml_pretend_ctx_error_ex(
322 application_data->input_name,
323 line,
324 column,
325 "tree error %s in %s, line: %zu, column: %zu\n",
326 dom_lexbor_tree_error_code_to_string(error->id),
327 application_data->input_name,
328 line,
329 column
330 );
331 } else {
/* The range end is inclusive, hence the "- 1". */
332 php_libxml_pretend_ctx_error_ex(
333 application_data->input_name,
334 line,
335 column,
336 "tree error %s in %s, line: %zu, column: %zu-%zu\n",
337 dom_lexbor_tree_error_code_to_string(error->id),
338 application_data->input_name,
339 line,
340 column,
341 column + len - 1
342 );
343 }
344}
345
346static xmlNodePtr dom_search_child(xmlNodePtr parent, const char *searching_for)
347{
348 xmlNodePtr node = parent->children;
349 while (node != NULL) {
350 if (node->type == XML_ELEMENT_NODE && strcmp((const char *) node->name, searching_for) == 0) {
351 return node;
352 }
353 node = node->next;
354 }
355 return NULL;
356}
357
358static void dom_place_remove_element_and_hoist_children(xmlNodePtr parent, const char *searching_for)
359{
360 xmlNodePtr node = dom_search_child(parent, searching_for);
361 if (node != NULL) {
362 xmlUnlinkNode(node);
363
364 xmlNodePtr child = node->children;
365 while (child != NULL) {
366 xmlUnlinkNode(child);
367 xmlAddChild(parent, child);
368 child = node->children;
369 }
370
371 xmlFreeNode(node);
372 }
373}
374
/* Post-processes a freshly built document: when HTML_PARSE_NOIMPLIED is set, the html, head and body
 * elements that were not explicitly present in the input are removed and their children hoisted.
 * NOTE(review): the declarations of `options` and `observations` (listing lines 377-378) are not visible in this view. */
375static void dom_post_process_html5_loading(
376 xmlDocPtr lxml_doc,
379)
380{
381 if (options & HTML_PARSE_NOIMPLIED) {
/* NOTE(review): html_node is used without a NULL check; presumably the parser always produces an html element - confirm. */
382 xmlNodePtr html_node = dom_search_child((xmlNodePtr) lxml_doc, "html");
383 if (!observations->has_explicit_head_tag) {
384 dom_place_remove_element_and_hoist_children(html_node, "head");
385 }
386 if (!observations->has_explicit_body_tag) {
387 dom_place_remove_element_and_hoist_children(html_node, "body");
388 }
/* html is handled last, after head and body have been looked up through it. */
389 if (!observations->has_explicit_html_tag) {
390 dom_place_remove_element_and_hoist_children((xmlNodePtr) lxml_doc, "html");
391 }
392 }
393}
394
395/* https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding */
/* Determines the encoding of `source`: first by byte order mark, then by a prescan of at most the
 * first 1024 bytes, and finally by falling back to DOM_FALLBACK_ENCODING_ID. The result also tells
 * the caller how many BOM bytes to skip.
 * NOTE(review): several statements are not visible in this view (listing lines 402, 407, 411, 418-419,
 * 431, 440, 444): the encoding_data assignments for the BOM cases, the declaration and setup of
 * `encoding`/`status`, the lookup that yields `entry`, and whatever runs before the success return
 * and under the `fallback` label (presumably cleanup of `encoding`). */
396static dom_character_encoding_data dom_determine_encoding(const char *source, size_t source_len)
397{
398 dom_character_encoding_data result;
399
400 /* BOM sniffing */
/* EF BB BF is the UTF-8 byte order mark. */
401 if (source_len >= 3 && source[0] == '\xEF' && source[1] == '\xBB' && source[2] == '\xBF') {
403 result.bom_shift = 3;
404 return result;
405 } else if (source_len >= 2) {
/* FE FF is the UTF-16 big-endian byte order mark. */
406 if (source[0] == '\xFE' && source[1] == '\xFF') {
408 result.bom_shift = 2;
409 return result;
/* FF FE is the UTF-16 little-endian byte order mark. */
410 } else if (source[0] == '\xFF' && source[1] == '\xFE') {
412 result.bom_shift = 2;
413 return result;
414 }
415 }
416
417 /* Perform prescan */
420 if (status != LXB_STATUS_OK) {
421 goto fallback_uninit;
422 }
423 /* This is the "wait either for 1024 bytes or 500ms" part */
424 if (source_len > 1024) {
425 source_len = 1024;
426 }
427 status = lxb_html_encoding_determine(&encoding, (const lxb_char_t *) source, (const lxb_char_t *) source + source_len);
428 if (status != LXB_STATUS_OK) {
429 goto fallback;
430 }
432 if (entry == NULL) {
433 goto fallback;
434 }
/* Resolve the name found by the prescan to encoding data; unknown names fall back. */
435 result.encoding_data = lxb_encoding_data_by_pre_name(entry->name, entry->end - entry->name);
436 if (!result.encoding_data) {
437 goto fallback;
438 }
439 result.bom_shift = 0;
441 return result;
442
443fallback:
445fallback_uninit:
446 result.encoding_data = lxb_encoding_data(DOM_FALLBACK_ENCODING_ID);
447 result.bom_shift = 0;
448 return result;
449}
450
/* Configures the decoding side of `decoding_encoding_ctx` for `encoding_data` and tells
 * `application_data` which view of the input the error reporters should use: the raw bytes
 * starting at `buf_start` on the fast path, the decoded code point buffer otherwise.
 * NOTE(review): the names of the two calls whose argument lists appear below, and one argument
 * of the second call (listing lines 457, 463, 466), are not visible in this view. */
451static void dom_setup_parser_encoding_manually(const lxb_char_t *buf_start, const lxb_encoding_data_t *encoding_data, dom_decoding_encoding_ctx *decoding_encoding_ctx, dom_lexbor_libxml2_bridge_application_data *application_data)
452{
/* Static storage: its address is handed to the decoder configuration below. */
453 static const lxb_codepoint_t replacement_codepoint = LXB_ENCODING_REPLACEMENT_CODEPOINT;
454
455 decoding_encoding_ctx->decode_data = encoding_data;
456
458 &decoding_encoding_ctx->decode,
459 decoding_encoding_ctx->decode_data,
460 decoding_encoding_ctx->codepoints,
461 sizeof(decoding_encoding_ctx->codepoints) / sizeof(*decoding_encoding_ctx->codepoints)
462 );
464 &decoding_encoding_ctx->decode,
465 &replacement_codepoint,
467 );
468 /* Note: encode_data is for UTF-8 */
/* So the fast path is taken exactly when the input encoding is UTF-8 as well. */
469 decoding_encoding_ctx->fast_path = decoding_encoding_ctx->decode_data == decoding_encoding_ctx->encode_data;
470
/* Exactly one of the two input views is non-NULL afterwards. */
471 if (decoding_encoding_ctx->fast_path) {
472 application_data->current_input_codepoints = NULL;
473 application_data->current_input_characters = (const char *) buf_start;
474 } else {
475 application_data->current_input_codepoints = decoding_encoding_ctx->codepoints;
476 application_data->current_input_characters = NULL;
477 }
478}
479
480static void dom_setup_parser_encoding_implicitly(
481 const lxb_char_t **buf_ref,
482 size_t *read,
483 dom_decoding_encoding_ctx *decoding_encoding_ctx,
484 dom_lexbor_libxml2_bridge_application_data *application_data
485)
486{
487 const char *buf_start = (const char *) *buf_ref;
488 dom_character_encoding_data dom_encoding_data = dom_determine_encoding(buf_start, *read);
489 *buf_ref += dom_encoding_data.bom_shift;
490 *read -= dom_encoding_data.bom_shift;
491 dom_setup_parser_encoding_manually((const lxb_char_t *) buf_start, dom_encoding_data.encoding_data, decoding_encoding_ctx, application_data);
492}
493
/* Feeds one chunk of encoded bytes to the HTML parser and advances the input bookkeeping.
 * `encoded_length` is the number of bytes in `encoding_output` given to the parser;
 * `input_buffer_length` is the length of the input window this chunk corresponds to and is
 * what the total offset is advanced by. Returns false when the parser rejects the chunk.
 * NOTE(review): the `ctx` parameter (listing line 495) and the condition that opens the block
 * closed on listing line 514 (listing line 511) are not visible in this view. */
494static bool dom_process_parse_chunk(
496 lxb_html_document_t *document,
497 lxb_html_parser_t *parser,
498 size_t encoded_length,
499 const lxb_char_t *encoding_output,
500 size_t input_buffer_length,
501 size_t *tokenizer_error_offset,
502 size_t *tree_error_offset
503)
504{
505 dom_lexbor_libxml2_bridge_application_data *application_data = ctx->application_data;
/* The window length must be set before errors are reported, since line/column lookups clamp to it. */
506 application_data->current_input_length = input_buffer_length;
507 lexbor_status_t lexbor_status = lxb_html_document_parse_chunk(document, encoding_output, encoded_length);
508 if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
509 return false;
510 }
512 lexbor_libxml2_bridge_report_errors(ctx, parser, encoding_output, application_data->current_total_offset, tokenizer_error_offset, tree_error_offset);
/* Bring the cache to the end of this window so that its line and column carry over to the next one. */
513 dom_find_line_and_column_using_cache(application_data, &application_data->cache_tokenizer, application_data->current_total_offset + input_buffer_length);
514 }
515 application_data->current_total_offset += input_buffer_length;
/* The cached offset is relative to the window, so it restarts at 0 for the next window. */
516 application_data->cache_tokenizer.last_offset = 0;
517 return true;
518}
519
/* UTF-8 to UTF-8 path: validates the bytes in [*buf_ref_ref, buf_end) and passes the valid
 * stretches to the parser straight from the input buffer. On return `*buf_ref_ref` points at
 * the first byte that was not consumed. Returns false when a chunk could not be parsed.
 * NOTE(review): the `ctx` parameter (listing line 521), the body of the size check on listing
 * line 538 (line 539) and two arguments of the call starting on listing line 595
 * (lines 599-600) are not visible in this view. */
520static bool dom_decode_encode_fast_path(
522 lxb_html_document_t *document,
523 lxb_html_parser_t *parser,
524 const lxb_char_t **buf_ref_ref,
525 const lxb_char_t *buf_end,
526 dom_decoding_encoding_ctx *decoding_encoding_ctx,
527 size_t *tokenizer_error_offset,
528 size_t *tree_error_offset
529)
530{
531 const lxb_char_t *buf_ref = *buf_ref_ref;
532
533 /* If we returned for needing more bytes, we need to finish up the buffer for the old codepoint. */
534 if (decoding_encoding_ctx->decode.status == LXB_STATUS_CONTINUE) {
/* The completed code point is re-encoded into this small local buffer. */
535 lxb_char_t buf[4];
536 lxb_char_t *buf_ptr = buf;
537 lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
538 if (lxb_encoding_encode_utf_8_single(&decoding_encoding_ctx->encode, &buf_ptr, buf + sizeof(buf), codepoint) > sizeof(buf)) {
540 }
541 decoding_encoding_ctx->decode.status = LXB_STATUS_OK;
542
543 if (!dom_process_parse_chunk(
544 ctx,
545 document,
546 parser,
547 buf_ptr - buf,
548 buf,
549 buf_ref - *buf_ref_ref,
550 tokenizer_error_offset,
551 tree_error_offset
552 )) {
553 goto fail_oom;
554 }
555 }
556
/* last_output marks the start of the bytes that have not been handed to the parser yet. */
557 const lxb_char_t *last_output = buf_ref;
558 while (buf_ref != buf_end) {
559 /* Fast path converts non-validated UTF-8 -> validated UTF-8 */
560 if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
561 /* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
562 * need more UTF-8 bytes to complete a sequence.
563 * It might be tempting to use SIMD here, but it turns out that this is less efficient because
564 * we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
565 buf_ref++;
566 continue;
567 }
568 const lxb_char_t *buf_ref_backup = buf_ref;
569 lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
/* Values above the maximum code point signal a decoder condition rather than a character. */
570 if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
571 size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */
/* Flush the valid bytes seen so far; the skipped bytes count as input but are not passed on. */
572 if (!dom_process_parse_chunk(
573 ctx,
574 document,
575 parser,
576 buf_ref - last_output - skip,
577 last_output,
578 buf_ref - last_output,
579 tokenizer_error_offset,
580 tree_error_offset
581 )) {
582 goto fail_oom;
583 }
584
585 if (codepoint == LXB_ENCODING_DECODE_CONTINUE) {
586 ZEND_ASSERT(buf_ref == buf_end);
587 /* The decoder needs more data but the entire buffer is consumed.
588 * All valid data is outputted, and if the remaining data for the code point
589 * is invalid, the next call will output the replacement bytes. */
590 *buf_ref_ref = buf_ref;
591 decoding_encoding_ctx->decode.status = LXB_STATUS_CONTINUE;
592 return true;
593 }
594
/* This chunk is accounted with an input length of 0.
 * NOTE(review): presumably it emits the replacement bytes; its data arguments are not visible here. */
595 if (!dom_process_parse_chunk(
596 ctx,
597 document,
598 parser,
601 0,
602 tokenizer_error_offset,
603 tree_error_offset
604 )) {
605 goto fail_oom;
606 }
607
608 last_output = buf_ref;
609 }
610 }
/* Flush whatever valid bytes remain after the loop. */
611 if (buf_ref != last_output
612 && !dom_process_parse_chunk(
613 ctx,
614 document,
615 parser,
616 buf_ref - last_output,
617 last_output,
618 buf_ref - last_output,
619 tokenizer_error_offset,
620 tree_error_offset
621 )) {
622 goto fail_oom;
623 }
624 *buf_ref_ref = buf_ref;
625 return true;
626fail_oom:
627 *buf_ref_ref = buf_ref;
628 return false;
629}
630
/* Generic conversion path: decodes the bytes in [*buf_ref_ref, buf_end) to code points with the
 * input encoding, encodes those code points with the output encoding and passes the encoded
 * bytes to the parser. Both steps are repeated while their buffer is reported as too small.
 * On return `*buf_ref_ref` points at the first byte that was not consumed. Returns false when
 * a chunk could not be parsed.
 * NOTE(review): the `ctx` parameter (listing line 632) is not visible in this view. */
631static bool dom_decode_encode_slow_path(
633 lxb_html_document_t *document,
634 lxb_html_parser_t *parser,
635 const lxb_char_t **buf_ref_ref,
636 const lxb_char_t *buf_end,
637 dom_decoding_encoding_ctx *decoding_encoding_ctx,
638 size_t *tokenizer_error_offset,
639 size_t *tree_error_offset
640)
641{
642 const lxb_char_t *buf_ref = *buf_ref_ref;
643 lexbor_status_t decode_status, encode_status;
644 do {
645 decode_status = decoding_encoding_ctx->decode_data->decode(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
646
647 const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
648 size_t decoding_buffer_used = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
649 const lxb_codepoint_t *codepoints_end = decoding_encoding_ctx->codepoints + decoding_buffer_used;
650 do {
651 encode_status = decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoints_ref, codepoints_end);
652 ZEND_ASSERT(encode_status != LXB_STATUS_ERROR && "parameters and replacements should be valid");
/* NOTE(review): every pass of this inner loop reports decoding_buffer_used as the input length,
 * so the total offset advances by that amount once per pass - confirm this is intended. */
653 if (!dom_process_parse_chunk(
654 ctx,
655 document,
656 parser,
657 lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
658 decoding_encoding_ctx->encoding_output,
659 decoding_buffer_used,
660 tokenizer_error_offset,
661 tree_error_offset
662 )) {
663 goto fail_oom;
664 }
/* The output buffer has been consumed; start filling it from the beginning again. */
665 lxb_encoding_encode_buf_used_set(&decoding_encoding_ctx->encode, 0);
666 } while (encode_status == LXB_STATUS_SMALL_BUFFER);
/* Likewise for the code point buffer. */
667 lxb_encoding_decode_buf_used_set(&decoding_encoding_ctx->decode, 0);
668 } while (decode_status == LXB_STATUS_SMALL_BUFFER);
669 *buf_ref_ref = buf_ref;
670 return true;
671fail_oom:
672 *buf_ref_ref = buf_ref;
673 return false;
674}
675
/* Processes one input chunk, dispatching to the fast or the slow conversion path depending on
 * `decoding_encoding_ctx->fast_path`. All arguments are forwarded unchanged and the result of
 * the chosen path is returned.
 * NOTE(review): the `ctx` parameter (listing line 677) is not visible in this view. */
676static bool dom_parse_decode_encode_step(
678 lxb_html_document_t *document,
679 lxb_html_parser_t *parser,
680 const lxb_char_t **buf_ref_ref,
681 const lxb_char_t *buf_end,
682 dom_decoding_encoding_ctx *decoding_encoding_ctx,
683 size_t *tokenizer_error_offset,
684 size_t *tree_error_offset
685)
686{
687 if (decoding_encoding_ctx->fast_path) {
688 return dom_decode_encode_fast_path(
689 ctx,
690 document,
691 parser,
692 buf_ref_ref,
693 buf_end,
694 decoding_encoding_ctx,
695 tokenizer_error_offset,
696 tree_error_offset
697 );
698 } else {
699 return dom_decode_encode_slow_path(
700 ctx,
701 document,
702 parser,
703 buf_ref_ref,
704 buf_end,
705 decoding_encoding_ctx,
706 tokenizer_error_offset,
707 tree_error_offset
708 );
709 }
710}
711
/* Finishes the decoder and the encoder after the last input chunk and passes any bytes this
 * produces to the parser. Returns false when that final chunk could not be parsed.
 * NOTE(review): the `ctx` parameter (listing line 713), the declaration of `status` (line 721)
 * and the statements that follow each status assignment (lines 724, 731, 736; presumably
 * assertions) are not visible in this view. */
712static bool dom_parse_decode_encode_finish(
714 lxb_html_document_t *document,
715 lxb_html_parser_t *parser,
716 dom_decoding_encoding_ctx *decoding_encoding_ctx,
717 size_t *tokenizer_error_offset,
718 size_t *tree_error_offset
719)
720{
722
723 status = lxb_encoding_decode_finish(&decoding_encoding_ctx->decode);
725
/* Finishing the decoder may leave code points in its buffer; encode them if so. */
726 size_t decoding_buffer_size = lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode);
727 if (decoding_buffer_size > 0) {
728 const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) decoding_encoding_ctx->codepoints;
729 const lxb_codepoint_t *codepoints_end = codepoints_ref + decoding_buffer_size;
730 status = decoding_encoding_ctx->encode_data->encode(&decoding_encoding_ctx->encode, &codepoints_ref, codepoints_end);
732 /* No need to produce output here, as we finish the encoder below and pass the chunk. */
733 }
734
735 status = lxb_encoding_encode_finish(&decoding_encoding_ctx->encode);
/* The parser is only invoked when the encoder buffer actually holds bytes. */
737 if (lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode)
738 && !dom_process_parse_chunk(
739 ctx,
740 document,
741 parser,
742 lxb_encoding_encode_buf_used(&decoding_encoding_ctx->encode),
743 decoding_encoding_ctx->encoding_output,
744 lxb_encoding_decode_buf_used(&decoding_encoding_ctx->decode),
745 tokenizer_error_offset,
746 tree_error_offset
747 )) {
748 return false;
749 }
750 return true;
751}
752
753static bool check_options_validity(uint32_t arg_num, zend_long options)
754{
755 const zend_long VALID_OPTIONS = XML_PARSE_NOERROR | XML_PARSE_COMPACT | HTML_PARSE_NOIMPLIED | DOM_HTML_NO_DEFAULT_NS;
756 if ((options & ~VALID_OPTIONS) != 0) {
757 zend_argument_value_error(arg_num, "contains invalid flags (allowed flags: "
758 "LIBXML_NOERROR, "
759 "LIBXML_COMPACT, "
760 "LIBXML_HTML_NOIMPLIED, "
761 "Dom\\HTML_NO_DEFAULT_NS)");
762 return false;
763 }
764 return true;
765}
766
/* Dom\HTMLDocument::createEmpty(): creates an empty HTML document with the given encoding
 * (default "UTF-8"). The encoding name must be known to lxb_encoding_data_by_name(), otherwise
 * an argument value error is raised for argument 1.
 * NOTE(review): many statements are not visible in this view (listing lines 772, 779, 789-791,
 * 795-796, 800-801): the early exits after the two error checks, the call that receives the
 * argument list on lines 792-794, and the body of the `oom` label. */
767PHP_METHOD(Dom_HTMLDocument, createEmpty)
768{
769 const char *encoding = "UTF-8";
770 size_t encoding_len = strlen("UTF-8");
/* "|p": one optional argument, parsed into a pointer and a length. */
771 if (zend_parse_parameters(ZEND_NUM_ARGS(), "|p", &encoding, &encoding_len) == FAILURE) {
773 }
774
775 const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) encoding, encoding_len);
776
777 if (encoding_data == NULL) {
778 zend_argument_value_error(1, "must be a valid document encoding");
780 }
781
782 xmlDocPtr lxml_doc = php_dom_create_html_doc();
783 if (UNEXPECTED(lxml_doc == NULL)) {
784 goto oom;
785 }
786
/* NOTE(review): the result of xmlStrdup() is not checked in the visible code. */
787 lxml_doc->encoding = xmlStrdup((const xmlChar *) encoding);
788
792 (xmlNodePtr) lxml_doc,
793 NULL
794 );
797 return;
798
799oom:
802}
803
804/* Only bother to register error handling when the error reports can become observable. */
805static bool dom_should_register_error_handlers(zend_long options)
806{
807 if (options & XML_PARSE_NOERROR) {
808 return false;
809 }
810
811 return php_libxml_uses_internal_errors() || ((EG(error_reporting) | EG(user_error_handler_error_reporting)) & E_WARNING);
812}
813
814PHP_METHOD(Dom_HTMLDocument, createFromString)
815{
816 const char *source, *override_encoding = NULL;
817 size_t source_len, override_encoding_len;
818 zend_long options = 0;
821 "s|lp!",
822 &source,
823 &source_len,
824 &options,
825 &override_encoding,
826 &override_encoding_len
827 ) == FAILURE) {
829 }
830
831 if (!check_options_validity(2, options)) {
833 }
834
835 dom_lexbor_libxml2_bridge_application_data application_data;
836 application_data.input_name = "Entity";
837 application_data.current_total_offset = 0;
838 application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
839 dom_reset_line_column_cache(&application_data.cache_tokenizer);
842 if (dom_should_register_error_handlers(options)) {
844 &ctx,
845 dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
846 dom_lexbor_libxml2_bridge_tree_error_reporter
847 );
848 }
849 ctx.application_data = &application_data;
850
851 size_t tokenizer_error_offset = 0;
852 size_t tree_error_offset = 0;
853
854 /* Setup everything encoding & decoding related */
855 const lxb_char_t *buf_ref = (const lxb_char_t *) source;
856 dom_decoding_encoding_ctx decoding_encoding_ctx;
857 dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
858 if (override_encoding != NULL) {
859 const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
860 (const lxb_char_t *) override_encoding,
861 override_encoding_len
862 );
863 if (!encoding_data) {
864 zend_argument_value_error(3, "must be a valid document encoding");
866 }
867 dom_setup_parser_encoding_manually(buf_ref, encoding_data, &decoding_encoding_ctx, &application_data);
868 } else {
869 dom_setup_parser_encoding_implicitly(&buf_ref, &source_len, &decoding_encoding_ctx, &application_data);
870 }
871
873 if (UNEXPECTED(document == NULL)) {
874 goto fail_oom;
875 }
876
877 lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
878 if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
879 goto fail_oom;
880 }
881
882 lxb_html_parser_t *parser = document->dom_document.parser;
883
884 while (source_len > 0) {
885 size_t chunk_size = source_len;
886 const size_t MAX_CHUNK_SIZE = sizeof(decoding_encoding_ctx.encoding_output) / sizeof(*decoding_encoding_ctx.encoding_output);
887 if (chunk_size > MAX_CHUNK_SIZE) {
888 chunk_size = MAX_CHUNK_SIZE;
889 }
890 source_len -= chunk_size;
891
892 const lxb_char_t *buf_end = buf_ref + chunk_size;
893 bool result = dom_parse_decode_encode_step(
894 &ctx,
895 document,
896 parser,
897 &buf_ref,
898 buf_end,
899 &decoding_encoding_ctx,
900 &tokenizer_error_offset,
901 &tree_error_offset
902 );
903 if (!result) {
904 goto fail_oom;
905 }
906
907 /* In the string case we have a single buffer that acts as a sliding window.
908 * The `current_input_characters` field starts pointing at the start of the buffer, but needs to slide along the
909 * sliding window as well. */
910 if (application_data.current_input_characters) {
911 application_data.current_input_characters += chunk_size;
912 }
913 }
914
915 if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
916 goto fail_oom;
917 }
918
919 lexbor_status = lxb_html_document_parse_chunk_end(document);
920 if (lexbor_status != LXB_STATUS_OK) {
921 goto fail_oom;
922 }
923
925
926 xmlDocPtr lxml_doc;
928 document,
929 &lxml_doc,
930 options & XML_PARSE_COMPACT,
932 private_data
933 );
935 if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
936 php_dom_private_data_destroy(private_data);
937 php_libxml_ctx_error(
938 NULL,
939 "%s in %s",
940 dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status),
941 application_data.input_name
942 );
945 }
947
948 dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
949
950 if (decoding_encoding_ctx.decode_data) {
951 lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
952 } else {
953 lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
954 }
955
959 (xmlNodePtr) lxml_doc,
960 NULL
961 );
963 intern->document->quirks_mode = ctx.observations.quirks_mode;
964 intern->document->private_data = php_dom_libxml_private_data_header(private_data);
965 return;
966
967fail_oom:
971}
972
973PHP_METHOD(Dom_HTMLDocument, createFromFile)
974{
975 const char *filename, *override_encoding = NULL;
976 php_dom_private_data *private_data = NULL;
977 size_t filename_len, override_encoding_len;
978 zend_long options = 0;
979 php_stream *stream = NULL;
982 "p|lp!",
983 &filename,
985 &options,
986 &override_encoding,
987 &override_encoding_len
988 ) == FAILURE) {
990 }
991
992 /* See php_libxml_streams_IO_open_wrapper(), apparently this caused issues in the past. */
993 if (strstr(filename, "%00")) {
994 zend_argument_value_error(1, "must not contain percent-encoded NUL bytes");
996 }
997
998 if (!check_options_validity(2, options)) {
1000 }
1001
1002 dom_lexbor_libxml2_bridge_application_data application_data;
1003 application_data.input_name = filename;
1004 application_data.current_total_offset = 0;
1005 application_data.html_no_implied = options & HTML_PARSE_NOIMPLIED;
1006 dom_reset_line_column_cache(&application_data.cache_tokenizer);
1009 if (dom_should_register_error_handlers(options)) {
1011 &ctx,
1012 dom_lexbor_libxml2_bridge_tokenizer_error_reporter,
1013 dom_lexbor_libxml2_bridge_tree_error_reporter
1014 );
1015 }
1016 ctx.application_data = &application_data;
1017
1018 char buf[4096];
1019
1020 /* Setup everything encoding & decoding related */
1021 dom_decoding_encoding_ctx decoding_encoding_ctx;
1022 dom_decoding_encoding_ctx_init(&decoding_encoding_ctx);
1023 bool should_determine_encoding_implicitly = true; /* First read => determine encoding implicitly */
1024 if (override_encoding != NULL) {
1025 const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1026 (const lxb_char_t *) override_encoding,
1027 override_encoding_len
1028 );
1029 if (!encoding_data) {
1030 zend_argument_value_error(3, "must be a valid document encoding");
1031 RETURN_THROWS();
1032 }
1033 should_determine_encoding_implicitly = false;
1034 dom_setup_parser_encoding_manually((const lxb_char_t *) buf, encoding_data, &decoding_encoding_ctx, &application_data);
1035 }
1036
1037 zend_string *opened_path = NULL;
1038 stream = php_stream_open_wrapper_ex(filename, "rb", REPORT_ERRORS, &opened_path, php_libxml_get_stream_context());
1039 if (!stream) {
1040 if (!EG(exception)) {
1041 zend_throw_exception_ex(NULL, 0, "Cannot open file '%s'", filename);
1042 }
1043 RETURN_THROWS();
1044 }
1045
1046 /* MIME sniff */
1047 if (should_determine_encoding_implicitly) {
1048 zend_string *charset = php_libxml_sniff_charset_from_stream(stream);
1049 if (charset != NULL) {
1050 const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1051 (const lxb_char_t *) ZSTR_VAL(charset),
1053 );
1054 if (encoding_data != NULL) {
1055 should_determine_encoding_implicitly = false;
1056 dom_setup_parser_encoding_manually(
1057 (const lxb_char_t *) buf,
1058 encoding_data,
1059 &decoding_encoding_ctx,
1060 &application_data
1061 );
1062 }
1064 }
1065 }
1066
1068 if (UNEXPECTED(document == NULL)) {
1069 goto fail_oom;
1070 }
1071
1072 lxb_status_t lexbor_status = lxb_html_document_parse_chunk_begin(document);
1073 if (UNEXPECTED(lexbor_status != LXB_STATUS_OK)) {
1074 goto fail_oom;
1075 }
1076
1077 size_t tokenizer_error_offset = 0;
1078 size_t tree_error_offset = 0;
1079 ssize_t read;
1080 lxb_html_parser_t *parser = document->dom_document.parser;
1081
1082 while ((read = php_stream_read(stream, buf, sizeof(buf))) > 0) {
1083 const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1084
1085 if (should_determine_encoding_implicitly) {
1086 should_determine_encoding_implicitly = false;
1087 dom_setup_parser_encoding_implicitly(&buf_ref, (size_t *) &read, &decoding_encoding_ctx, &application_data);
1088 }
1089
1090 const lxb_char_t *buf_end = buf_ref + read;
1091 bool result = dom_parse_decode_encode_step(
1092 &ctx,
1093 document,
1094 parser,
1095 &buf_ref,
1096 buf_end,
1097 &decoding_encoding_ctx,
1098 &tokenizer_error_offset,
1099 &tree_error_offset
1100 );
1101 if (!result) {
1102 goto fail_oom;
1103 }
1104 }
1105
1106 if (!dom_parse_decode_encode_finish(&ctx, document, parser, &decoding_encoding_ctx, &tokenizer_error_offset, &tree_error_offset)) {
1107 goto fail_oom;
1108 }
1109
1110 lexbor_status = lxb_html_document_parse_chunk_end(document);
1111 if (lexbor_status != LXB_STATUS_OK) {
1112 goto fail_oom;
1113 }
1114
1115 private_data = php_dom_private_data_create();
1116
1117 xmlDocPtr lxml_doc;
1119 document,
1120 &lxml_doc,
1121 options & XML_PARSE_COMPACT,
1123 private_data
1124 );
1126 if (UNEXPECTED(bridge_status != LEXBOR_LIBXML2_BRIDGE_STATUS_OK)) {
1127 php_libxml_ctx_error(NULL, "%s in %s", dom_lexbor_libxml2_bridge_status_code_to_string(bridge_status), filename);
1129 goto fail_general;
1130 }
1131 lxb_html_document_destroy(document);
1132
1133 dom_post_process_html5_loading(lxml_doc, options, &ctx.observations);
1134
1135 if (decoding_encoding_ctx.decode_data) {
1136 lxml_doc->encoding = xmlStrdup((const xmlChar *) decoding_encoding_ctx.decode_data->name);
1137 } else {
1138 lxml_doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
1139 }
1140
1141 if (stream->wrapper == &php_plain_files_wrapper && opened_path != NULL) {
1142 xmlChar *converted = xmlPathToURI((const xmlChar *) ZSTR_VAL(opened_path));
1143 if (UNEXPECTED(!converted)) {
1144 goto fail_oom;
1145 }
1146 /* Check for "file:/" instead of "file://" because of libxml2 quirk */
1147 if (strncmp((const char *) converted, "file:/", sizeof("file:/") - 1) != 0) {
1148 xmlChar *buffer = xmlStrdup((const xmlChar *) "file://");
1149 if (UNEXPECTED(!buffer)) {
1150 xmlFree(converted);
1151 goto fail_oom;
1152 }
1153 xmlChar *new_buffer = xmlStrcat(buffer, converted);
1154 if (UNEXPECTED(!new_buffer)) {
1155 xmlFree(buffer);
1156 xmlFree(converted);
1157 goto fail_oom;
1158 }
1159 xmlFree(converted);
1160 lxml_doc->URL = new_buffer;
1161 } else {
1162#ifdef PHP_WIN32
1163 converted = php_dom_libxml_fix_file_path(converted);
1164#endif
1165 lxml_doc->URL = converted;
1166 }
1167 } else {
1168 lxml_doc->URL = xmlStrdup((const xmlChar *) filename);
1169 }
1170
1171 if (opened_path != NULL) {
1172 zend_string_release_ex(opened_path, false);
1173 }
1174 php_stream_close(stream);
1175 stream = NULL;
1176
1180 (xmlNodePtr) lxml_doc,
1181 NULL
1182 );
1183 dom_set_xml_class(intern->document);
1184 intern->document->quirks_mode = ctx.observations.quirks_mode;
1185 intern->document->private_data = php_dom_libxml_private_data_header(private_data);
1186 return;
1187
1188fail_oom:
1190fail_general:
1191 if (private_data != NULL) {
1192 php_dom_private_data_destroy(private_data);
1193 }
1194 lxb_html_document_destroy(document);
1195 php_stream_close(stream);
1196 if (opened_path != NULL) {
1197 zend_string_release_ex(opened_path, false);
1198 }
1199}
1200
1201static zend_result dom_write_output_smart_str(void *ctx, const char *buf, size_t size)
1202{
1203 smart_str_appendl((smart_str *) ctx, buf, size);
1204 return SUCCESS;
1205}
1206
1207static zend_result dom_write_output_stream(void *application_data, const char *buf, size_t len)
1208{
1209 php_stream *stream = (php_stream *) application_data;
1210 if (UNEXPECTED(php_stream_write(stream, buf, len) < 0)) {
1211 return FAILURE;
1212 }
1213 return SUCCESS;
1214}
1215
1216static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
1217{
1218 dom_output_ctx *output = (dom_output_ctx *) application_data;
1219 lxb_status_t decode_status, encode_status;
1220 const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1221 const lxb_char_t *buf_end = buf_ref + len;
1222
1223 do {
1224 decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);
1225
1226 const lxb_codepoint_t *codepoints_ref = output->codepoints;
1227 const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
1228 do {
1229 encode_status = output->encoding_data->encode(output->encode, &codepoints_ref, codepoints_end);
1230 if (UNEXPECTED(output->write_output(
1231 output->output_data,
1232 (const char *) output->encoding_output,
1233 lxb_encoding_encode_buf_used(output->encode)
1234 ) != SUCCESS)) {
1235 return FAILURE;
1236 }
1237 lxb_encoding_encode_buf_used_set(output->encode, 0);
1238 } while (encode_status == LXB_STATUS_SMALL_BUFFER);
1239 lxb_encoding_decode_buf_used_set(output->decode, 0);
1240 } while (decode_status == LXB_STATUS_SMALL_BUFFER);
1241
1242 return SUCCESS;
1243}
1244
1245static zend_result dom_saveHTML_write_string(void *application_data, const char *buf)
1246{
1247 return dom_saveHTML_write_string_len(application_data, buf, strlen(buf));
1248}
1249
1250static zend_result dom_common_save(dom_output_ctx *output_ctx, dom_object *intern, const xmlDoc *docp, const xmlNode *node)
1251{
1252 /* Initialize everything related to encoding & decoding */
1254 const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name(
1255 (const lxb_char_t *) docp->encoding,
1256 strlen((const char *) docp->encoding)
1257 );
1259 lxb_encoding_decode_t decode;
1260 lxb_char_t encoding_output[4096];
1261 lxb_codepoint_t codepoints[4096];
1262 (void) lxb_encoding_encode_init(&encode, encoding_data, encoding_output, sizeof(encoding_output) / sizeof(*encoding_output));
1263 (void) lxb_encoding_decode_init(&decode, decoding_data, codepoints, sizeof(codepoints) / sizeof(*codepoints));
1264 if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
1266 } else {
1267 /* Fallback if there is no replacement by default */
1269 }
1271
1272 output_ctx->encoding_data = encoding_data;
1273 output_ctx->decoding_data = decoding_data;
1274 output_ctx->encode = &encode;
1275 output_ctx->decode = &decode;
1276 output_ctx->codepoints = codepoints;
1277 output_ctx->encoding_output = encoding_output;
1278
1280 ctx.write_string_len = dom_saveHTML_write_string_len;
1281 ctx.write_string = dom_saveHTML_write_string;
1282 ctx.application_data = output_ctx;
1283 ctx.private_data = php_dom_get_private_data(intern);
1284 if (UNEXPECTED(dom_html5_serialize_outer(&ctx, node) != SUCCESS)) {
1285 return FAILURE;
1286 }
1287
1289 if (lxb_encoding_decode_buf_used(&decode)) {
1290 const lxb_codepoint_t *codepoints_ref = (const lxb_codepoint_t *) codepoints;
1291 (void) encoding_data->encode(&encode, &codepoints_ref, codepoints_ref + lxb_encoding_decode_buf_used(&decode));
1292 if (UNEXPECTED(output_ctx->write_output(
1293 output_ctx->output_data,
1294 (const char *) encoding_output,
1296 )) {
1297 return FAILURE;
1298 }
1299 }
1302 if (UNEXPECTED(output_ctx->write_output(
1303 output_ctx->output_data,
1304 (const char *) encoding_output,
1306 )) {
1307 return FAILURE;
1308 }
1309 }
1310
1311 return SUCCESS;
1312}
1313
1314PHP_METHOD(Dom_HTMLDocument, saveHtmlFile)
1315{
1316 zval *id;
1317 xmlDoc *docp;
1318 size_t file_len;
1319 dom_object *intern;
1320 char *file;
1321
1322 id = ZEND_THIS;
1323 if (zend_parse_parameters(ZEND_NUM_ARGS(), "p", &file, &file_len) == FAILURE) {
1324 RETURN_THROWS();
1325 }
1326
1327 if (file_len == 0) {
1329 RETURN_THROWS();
1330 }
1331
1332 php_stream *stream = php_stream_open_wrapper_ex(file, "wb", REPORT_ERRORS, /* opened_path */ NULL, php_libxml_get_stream_context());
1333 if (!stream) {
1335 }
1336
1337 DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
1338
1339 dom_output_ctx output_ctx;
1340 output_ctx.output_data = stream;
1341 output_ctx.write_output = dom_write_output_stream;
1342 if (UNEXPECTED(dom_common_save(&output_ctx, intern, docp, (const xmlNode *) docp) != SUCCESS)) {
1343 php_stream_close(stream);
1345 }
1346
1347 zend_long bytes = php_stream_tell(stream);
1348 php_stream_close(stream);
1349
1350 RETURN_LONG(bytes);
1351}
1352
1353PHP_METHOD(Dom_HTMLDocument, saveHtml)
1354{
1355 zval *nodep = NULL;
1356 const xmlDoc *docp;
1357 const xmlNode *node;
1358 dom_object *intern, *nodeobj;
1359
1361 RETURN_THROWS();
1362 }
1363
1364 DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, intern);
1365
1366 if (nodep != NULL) {
1367 DOM_GET_OBJ(node, nodep, xmlNodePtr, nodeobj);
1368 if (node->doc != docp) {
1370 RETURN_THROWS();
1371 }
1372 } else {
1373 node = (const xmlNode *) docp;
1374 }
1375
1376 smart_str buf = {0};
1377 dom_output_ctx output_ctx;
1378 output_ctx.output_data = &buf;
1379 output_ctx.write_output = dom_write_output_smart_str;
1380 /* Can't fail because dom_write_output_smart_str() can't fail. */
1381 zend_result result = dom_common_save(&output_ctx, intern, docp, node);
1383
1384 RETURN_STR(smart_str_extract(&buf));
1385}
1386
1388{
1389 DOM_PROP_NODE(xmlDocPtr, docp, obj);
1390
1391 /* Typed property, can only be IS_STRING. */
1392 ZEND_ASSERT(Z_TYPE_P(newval) == IS_STRING);
1393
1394 zend_string *str = Z_STR_P(newval);
1395 const lxb_encoding_data_t *encoding_data = lxb_encoding_data_by_name((const lxb_char_t *) ZSTR_VAL(str), ZSTR_LEN(str));
1396
1397 if (encoding_data != NULL) {
1398 xmlFree(BAD_CAST docp->encoding);
1399 docp->encoding = xmlStrdup((const xmlChar *) encoding_data->name);
1400 } else {
1401 zend_value_error("Invalid document encoding");
1402 return FAILURE;
1403 }
1404
1405 return SUCCESS;
1406}
1407
1408static xmlNodePtr dom_html_document_element_read_raw(const xmlDoc *docp, bool (*accept)(const xmlChar *))
1409{
1410 const xmlNode *root = xmlDocGetRootElement(docp);
1411 if (root == NULL || !(php_dom_ns_is_fast(root, php_dom_ns_is_html_magic_token) && xmlStrEqual(root->name, BAD_CAST "html"))) {
1412 return NULL;
1413 }
1414
1415 xmlNodePtr cur = root->children;
1416 while (cur != NULL) {
1417 if (cur->type == XML_ELEMENT_NODE && php_dom_ns_is_fast(cur, php_dom_ns_is_html_magic_token) && accept(cur->name)) {
1418 return cur;
1419 }
1420 cur = cur->next;
1421 }
1422
1423 return NULL;
1424}
1425
1426zend_result dom_html_document_element_read_helper(dom_object *obj, zval *retval, bool (*accept)(const xmlChar *))
1427{
1428 DOM_PROP_NODE(const xmlDoc *, docp, obj);
1429
1430 const xmlNode *element = dom_html_document_element_read_raw(docp, accept);
1431 php_dom_create_nullable_object((xmlNodePtr) element, retval, obj);
1432
1433 return SUCCESS;
1434}
1435
1436static bool dom_accept_body_name(const xmlChar *name)
1437{
1438 return xmlStrEqual(name, BAD_CAST "body") || xmlStrEqual(name, BAD_CAST "frameset");
1439}
1440
1441static bool dom_accept_head_name(const xmlChar *name)
1442{
1443 return xmlStrEqual(name, BAD_CAST "head");
1444}
1445
1446/* https://html.spec.whatwg.org/#dom-document-body */
1448{
1449 return dom_html_document_element_read_helper(obj, retval, dom_accept_body_name);
1450}
1451
1452/* https://html.spec.whatwg.org/#dom-document-head */
1454{
1455 return dom_html_document_element_read_helper(obj, retval, dom_accept_head_name);
1456}
1457
1458/* https://html.spec.whatwg.org/#dom-document-body */
1460{
1461 DOM_PROP_NODE(xmlDocPtr, docp, obj);
1462
1463 /* 1. If the new value is not a body or frameset element, then throw a "HierarchyRequestError" DOMException. */
1464 if (Z_TYPE_P(newval) != IS_NULL) {
1465 dom_object *newval_intern = Z_DOMOBJ_P(newval);
1466 if (newval_intern->ptr != NULL) {
1467 xmlNodePtr newval_node = ((php_libxml_node_ptr *) newval_intern->ptr)->node;
1468 /* Note: because this property has type HTMLElement, we know the namespace is correct. */
1469 if (dom_accept_body_name(newval_node->name)) {
1470 /* 2. If the new value is the same as the body element, return. */
1471 const xmlNode *current_body_element = dom_html_document_element_read_raw(docp, dom_accept_body_name);
1472 if (current_body_element == newval_node) {
1473 return SUCCESS;
1474 }
1475
1476 /* 3. If the body element is not null, then replace the body element with the new value within the body element's parent and return. */
1477 if (current_body_element != NULL) {
1478 php_dom_adopt_node(newval_node, obj, docp);
1479 xmlNodePtr old = xmlReplaceNode((xmlNodePtr) current_body_element, newval_node);
1480 if (old != NULL && old->_private == NULL) {
1481 php_libxml_node_free_resource(old);
1482 }
1483 return SUCCESS;
1484 }
1485
1486 /* 4. If there is no document element, throw a "HierarchyRequestError" DOMException. */
1487 xmlNodePtr root = xmlDocGetRootElement(docp);
1488 if (root == NULL) {
1489 php_dom_throw_error_with_message(HIERARCHY_REQUEST_ERR, "A body can only be set if there is a document element", true);
1490 return FAILURE;
1491 }
1492
1493 /* 5. Append the new value to the document element. */
1494 php_dom_adopt_node(newval_node, obj, docp);
1495 xmlAddChild(root, newval_node);
1496 return SUCCESS;
1497 }
1498 }
1499 }
1500
1501 php_dom_throw_error_with_message(HIERARCHY_REQUEST_ERR, "The new body must either be a body or a frameset tag", true);
1502 return FAILURE;
1503}
1504
1505/* https://dom.spec.whatwg.org/#concept-child-text-content */
1506static zend_string *dom_get_child_text_content(const xmlNode *node)
1507{
1508 smart_str content = {0};
1509
1510 const xmlNode *text = node->children;
1511 while (text != NULL) {
1512 if ((text->type == XML_TEXT_NODE || text->type == XML_CDATA_SECTION_NODE) && text->content != NULL) {
1513 smart_str_appends(&content, (const char *) text->content);
1514 }
1515 text = text->next;
1516 }
1517
1518 return smart_str_extract(&content);
1519}
1520
1521/* https://html.spec.whatwg.org/#the-title-element-2 */
1522static xmlNodePtr dom_get_title_element(const xmlDoc *doc)
1523{
1524 xmlNodePtr node = doc->children;
1525
1526 while (node != NULL) {
1527 if (node->type == XML_ELEMENT_NODE) {
1528 if (php_dom_ns_is_fast(node, php_dom_ns_is_html_magic_token) && xmlStrEqual(node->name, BAD_CAST "title")) {
1529 break;
1530 }
1531 }
1532
1533 node = php_dom_next_in_tree_order(node, NULL);
1534 }
1535
1536 return node;
1537}
1538
1539/* The subtle difference is that this is about the direct title descendant of the svg element,
1540 * whereas the html variant of this function is about the first in-tree title element. */
1541static xmlNodePtr dom_get_svg_title_element(xmlNodePtr svg)
1542{
1543 xmlNodePtr cur = svg->children;
1544
1545 while (cur != NULL) {
1546 if (cur->type == XML_ELEMENT_NODE
1547 && php_dom_ns_is_fast(cur, php_dom_ns_is_svg_magic_token) && xmlStrEqual(cur->name, BAD_CAST "title")) {
1548 break;
1549 }
1550 cur = cur->next;
1551 }
1552
1553 return cur;
1554}
1555
1556/* https://html.spec.whatwg.org/#document.title */
1558{
1559 DOM_PROP_NODE(const xmlDoc *, docp, obj);
1560 xmlNodePtr root = xmlDocGetRootElement(docp);
1561
1562 if (root == NULL) {
1564 return SUCCESS;
1565 }
1566
1568
1569 /* 1. If the document element is an SVG svg element,
1570 * then let value be the child text content of the first SVG title element that is a child of the document element. */
1571 if (php_dom_ns_is_fast(root, php_dom_ns_is_svg_magic_token) && xmlStrEqual(root->name, BAD_CAST "svg")) {
1572 const xmlNode *title = dom_get_svg_title_element(root);
1573 if (title != NULL) {
1574 value = dom_get_child_text_content(title);
1575 }
1576 } else {
1577 /* 2. Otherwise, let value be the child text content of the title element,
1578 * or the empty string if the title element is null. */
1579 const xmlNode *title = dom_get_title_element(docp);
1580 if (title != NULL) {
1581 value = dom_get_child_text_content(title);
1582 }
1583 }
1584
1585 /* 3. Strip and collapse ASCII whitespace in value. */
1587
1588 /* 4. Return value. */
1590
1591 return SUCCESS;
1592}
1593
1594static void dom_string_replace_all(xmlDocPtr docp, xmlNodePtr element, zval *zv)
1595{
1596 dom_remove_all_children(element);
1597 xmlNode *text = xmlNewDocText(docp, BAD_CAST Z_STRVAL_P(zv));
1598 xmlAddChild(element, text);
1599}
1600
1601/* https://html.spec.whatwg.org/#document.title */
1603{
1604 DOM_PROP_NODE(xmlDocPtr, docp, obj);
1605 xmlNodePtr root = xmlDocGetRootElement(docp);
1606
1607 if (root == NULL) {
1608 return SUCCESS;
1609 }
1610
1611 /* If the document element is an SVG svg element */
1612 if (php_dom_ns_is_fast(root, php_dom_ns_is_svg_magic_token) && xmlStrEqual(root->name, BAD_CAST "svg")) {
1613 /* 1. If there is an SVG title element that is a child of the document element, let element be the first such element. */
1614 xmlNodePtr element = dom_get_svg_title_element(root);
1615
1616 /* 2. Otherwise: */
1617 if (element == NULL) {
1618 /* 2.1. Let element be the result of creating an element given the document element's node document,
1619 * title, and the SVG namespace. */
1620
1621 /* Annoyingly, we must create it in the svg namespace _without_ prefix... */
1622 xmlNsPtr ns = root->ns;
1623 if (ns->prefix != NULL) {
1624 /* Slow path... */
1628 zend_string_release_ex(href, false);
1629 }
1630
1631 element = xmlNewDocNode(docp, ns, BAD_CAST "title", NULL);
1632 if (UNEXPECTED(element == NULL)) {
1634 return FAILURE;
1635 }
1636
1637 /* 2.2. Insert element as the first child of the document element. */
1638 if (root->children == NULL) {
1639 root->last = element;
1640 } else {
1641 element->next = root->children;
1642 root->children->prev = element;
1643 }
1644 root->children = element;
1645 element->parent = root;
1646 }
1647
1648 /* 3. String replace all with the given value within element. */
1649 dom_string_replace_all(docp, element, newval);
1650 }
1651 /* If the document element is in the HTML namespace */
1653 /* 1. If the title element is null and the head element is null, then return. */
1654 xmlNodePtr title = dom_get_title_element(docp);
1655 xmlNodePtr head = dom_html_document_element_read_raw(docp, dom_accept_head_name);
1656 if (title == NULL && head == NULL) {
1657 return SUCCESS;
1658 }
1659
1660 /* 2. If the title element is non-null, let element be the title element. */
1661 xmlNodePtr element = title;
1662
1663 /* 3. Otherwise: */
1664 if (element == NULL) {
1665 /* 3.1. Let element be the result of creating an element given the document element's node document, title,
1666 * and the HTML namespace. */
1668 element = xmlNewDocNode(docp, php_dom_libxml_ns_mapper_ensure_html_ns(ns_mapper), BAD_CAST "title", NULL);
1669 if (UNEXPECTED(element == NULL)) {
1671 return FAILURE;
1672 }
1673
1674 /* 3.2. Append element to the head element. */
1675 xmlAddChild(head, element);
1676 }
1677
1678 /* 4. String replace all with the given value within element. */
1679 dom_string_replace_all(docp, element, newval);
1680 }
1681
1682 return SUCCESS;
1683}
1684
1685#if ZEND_DEBUG
1686PHP_METHOD(Dom_HTMLDocument, debugGetTemplateCount)
1687{
1688 xmlDocPtr doc;
1689 dom_object *intern;
1690
1692
1693 DOM_GET_OBJ(doc, ZEND_THIS, xmlDocPtr, intern);
1694 ZEND_IGNORE_VALUE(doc);
1695
1697}
1698#endif
1699
1700#endif /* HAVE_LIBXML && HAVE_DOM */
size_t len
Definition apprentice.c:174
bool exception
Definition assert.c:30
file(string $filename, int $flags=0, $context=null)
strstr(string $haystack, string $needle, bool $before_needle=false)
lexbor_status_t
Definition base.h:48
@ LXB_STATUS_SMALL_BUFFER
Definition base.h:64
@ LXB_STATUS_CONTINUE
Definition base.h:63
@ LXB_STATUS_OK
Definition base.h:49
@ LXB_STATUS_ERROR
Definition base.h:50
lxb_codepoint_t lxb_encoding_decode_utf_8_single(lxb_encoding_decode_t *ctx, const lxb_char_t **data, const lxb_char_t *end)
Definition decode.c:2780
DNS_STATUS status
Definition dns_win32.c:49
const lxb_encoding_data_t * lxb_encoding_data_by_pre_name(const lxb_char_t *name, size_t length)
Definition encoding.c:11
lxb_status_t lxb_html_encoding_init(lxb_html_encoding_t *em)
Definition encoding.c:67
lxb_html_encoding_t * lxb_html_encoding_destroy(lxb_html_encoding_t *em, bool self_destroy)
Definition encoding.c:86
lxb_status_t lxb_html_encoding_determine(lxb_html_encoding_t *em, const lxb_char_t *data, const lxb_char_t *end)
Definition encoding.c:103
PHP_DOM_EXPORT zend_class_entry * dom_modern_node_class_entry
PHP_DOM_EXPORT zend_class_entry * dom_html_document_class_entry
PHP_DOM_EXPORT zend_class_entry * dom_abstract_base_document_class_entry
zend_result dom_html_document_encoding_write(dom_object *obj, zval *retval)
zend_result dom_modern_document_implementation_read(dom_object *obj, zval *retval)
zend_result dom_html_document_body_write(dom_object *obj, zval *newval)
zend_result dom_html_document_title_write(dom_object *obj, zval *newval)
zend_result dom_html_document_head_read(dom_object *obj, zval *retval)
zend_result dom_html_document_body_read(dom_object *obj, zval *retval)
zend_result dom_html_document_title_read(dom_object *obj, zval *retval)
#define DOM_PROP_NODE(type, name, obj)
void php_dom_throw_error_with_message(dom_exception_code error_code, const char *error_message, bool strict_error)
void php_dom_throw_error(dom_exception_code error_code, bool strict_error)
@ INVALID_STATE_ERR
@ HIERARCHY_REQUEST_ERR
@ WRONG_DOCUMENT_ERR
int8_t lxb_encoding_encode_utf_8_single(lxb_encoding_encode_t *ctx, lxb_char_t **data, const lxb_char_t *end, lxb_codepoint_t cp)
Definition encode.c:1705
@ LXB_ENCODING_DECODE_CONTINUE
Definition base.h:54
#define LXB_ENCODING_REPLACEMENT_BYTES
Definition base.h:28
@ LXB_ENCODING_REPLACEMENT_SIZE
Definition base.h:39
@ LXB_ENCODING_MAX_CODEPOINT
Definition base.h:41
@ LXB_ENCODING_REPLACEMENT_CODEPOINT
Definition base.h:40
#define LXB_ENCODING_REPLACEMENT_BUFFER
Definition base.h:31
struct lxb_encoding_data lxb_encoding_data_t
Definition base.h:103
#define LXB_ENCODING_REPLACEMENT_BUFFER_LEN
Definition base.h:30
@ LXB_ENCODING_UTF_16BE
Definition const.h:43
@ LXB_ENCODING_UTF_16LE
Definition const.h:44
@ LXB_ENCODING_UTF_8
Definition const.h:45
lxb_inline lxb_status_t lxb_encoding_encode_finish(lxb_encoding_encode_t *encode)
Definition encoding.h:57
lxb_inline lxb_status_t lxb_encoding_encode_init(lxb_encoding_encode_t *encode, const lxb_encoding_data_t *encoding_data, lxb_char_t *buffer_out, size_t buffer_length)
Definition encoding.h:39
lxb_inline const lxb_encoding_data_t * lxb_encoding_data_by_name(const lxb_char_t *name, size_t length)
Definition encoding.h:297
lxb_inline const lxb_encoding_data_t * lxb_encoding_data(lxb_encoding_t encoding)
Definition encoding.h:315
lxb_inline lxb_status_t lxb_encoding_decode_replace_set(lxb_encoding_decode_t *decode, const lxb_codepoint_t *replace, size_t length)
Definition encoding.h:216
lxb_inline lxb_status_t lxb_encoding_decode_init(lxb_encoding_decode_t *decode, const lxb_encoding_data_t *encoding_data, lxb_codepoint_t *buffer_out, size_t buffer_length)
Definition encoding.h:143
lxb_inline void lxb_encoding_decode_buf_used_set(lxb_encoding_decode_t *decode, size_t buffer_used)
Definition encoding.h:203
lxb_inline size_t lxb_encoding_decode_buf_used(lxb_encoding_decode_t *decode)
Definition encoding.h:210
lxb_inline lxb_status_t lxb_encoding_encode_replace_set(lxb_encoding_encode_t *encode, const lxb_char_t *replace, size_t length)
Definition encoding.h:95
lxb_inline lxb_status_t lxb_encoding_decode_finish(lxb_encoding_decode_t *decode)
Definition encoding.h:161
lxb_inline size_t lxb_encoding_encode_buf_used(lxb_encoding_encode_t *encode)
Definition encoding.h:89
lxb_inline void lxb_encoding_encode_buf_used_set(lxb_encoding_encode_t *encode, size_t buffer_used)
Definition encoding.h:82
error($message)
Definition ext_skel.php:22
zval * zv
Definition ffi.c:3975
new_type size
Definition ffi.c:4365
zend_ffi_ctype_name_buf buf
Definition ffi.c:4685
zend_long offset
size_t filename_len
#define NULL
Definition gdcache.h:45
#define SUCCESS
Definition hash_sha3.c:261
void lexbor_libxml2_bridge_parse_set_error_callbacks(lexbor_libxml2_bridge_parse_context *ctx, lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter, lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter)
lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(lxb_html_document_t *document, xmlDocPtr *doc_out, bool compact_text_nodes, bool create_default_ns, php_dom_private_data *private_data)
lexbor_libxml2_bridge_status
@ LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW
@ LEXBOR_LIBXML2_BRIDGE_STATUS_FATAL_PARSE
@ LEXBOR_LIBXML2_BRIDGE_STATUS_OOM
@ LEXBOR_LIBXML2_BRIDGE_STATUS_OK
@ LEXBOR_LIBXML2_BRIDGE_STATUS_CANNOT_INIT
void lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations)
void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx)
void lexbor_libxml2_bridge_report_errors(const lexbor_libxml2_bridge_parse_context *ctx, lxb_html_parser_t *parser, const lxb_char_t *input_html, size_t chunk_offset, size_t *error_index_offset_tokenizer, size_t *error_index_offset_tree)
zend_result dom_html5_serialize_outer(dom_html5_serialize_context *ctx, const xmlNode *node)
lxb_inline lxb_html_encoding_entry_t * lxb_html_encoding_meta_entry(lxb_html_encoding_t *em, size_t idx)
Definition encoding.h:66
struct lxb_html_document lxb_html_document_t
Definition interface.h:95
lxb_html_tokenizer_error_id_t
Definition error.h:20
@ LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA
Definition error.h:108
@ LXB_HTML_TOKENIZER_ERROR_SUININST
Definition error.h:102
@ LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA
Definition error.h:60
@ LXB_HTML_TOKENIZER_ERROR_MIDOPUID
Definition error.h:68
@ LXB_HTML_TOKENIZER_ERROR_MIDOSYID
Definition error.h:70
@ LXB_HTML_TOKENIZER_ERROR_INCLCO
Definition error.h:56
@ LXB_HTML_TOKENIZER_ERROR_ABDOPUID
Definition error.h:24
@ LXB_HTML_TOKENIZER_ERROR_NECO
Definition error.h:90
@ LXB_HTML_TOKENIZER_ERROR_MIATVA
Definition error.h:64
@ LXB_HTML_TOKENIZER_ERROR_UNNACHRE
Definition error.h:118
@ LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA
Definition error.h:84
@ LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA
Definition error.h:110
@ LXB_HTML_TOKENIZER_ERROR_SUCHRE
Definition error.h:100
@ LXB_HTML_TOKENIZER_ERROR_ABCLOFEMCO
Definition error.h:22
@ LXB_HTML_TOKENIZER_ERROR_COCHRE
Definition error.h:36
@ LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE
Definition error.h:80
@ LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE
Definition error.h:82
@ LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE
Definition error.h:78
@ LXB_HTML_TOKENIZER_ERROR_UNCHINATNA
Definition error.h:106
@ LXB_HTML_TOKENIZER_ERROR_ENTAWIAT
Definition error.h:38
@ LXB_HTML_TOKENIZER_ERROR_EOINTA
Definition error.h:54
@ LXB_HTML_TOKENIZER_ERROR_ENTAWITRSO
Definition error.h:42
@ LXB_HTML_TOKENIZER_ERROR_UNNUCH
Definition error.h:112
@ LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID
Definition error.h:104
@ LXB_HTML_TOKENIZER_ERROR_EOINCO
Definition error.h:48
@ LXB_HTML_TOKENIZER_ERROR_EOINSCHTCOLITE
Definition error.h:52
@ LXB_HTML_TOKENIZER_ERROR_MIDONA
Definition error.h:66
@ LXB_HTML_TOKENIZER_ERROR_DUAT
Definition error.h:40
@ LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA
Definition error.h:62
@ LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE
Definition error.h:28
@ LXB_HTML_TOKENIZER_ERROR_INOPCO
Definition error.h:58
@ LXB_HTML_TOKENIZER_ERROR_MIENTANA
Definition error.h:72
@ LXB_HTML_TOKENIZER_ERROR_NOININST
Definition error.h:94
@ LXB_HTML_TOKENIZER_ERROR_EOBETANA
Definition error.h:44
@ LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID
Definition error.h:74
@ LXB_HTML_TOKENIZER_ERROR_COCHININST
Definition error.h:34
@ LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA
Definition error.h:32
@ LXB_HTML_TOKENIZER_ERROR_MIWHBEAT
Definition error.h:86
@ LXB_HTML_TOKENIZER_ERROR_EOINCD
Definition error.h:46
@ LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID
Definition error.h:76
@ LXB_HTML_TOKENIZER_ERROR_CDINHTCO
Definition error.h:30
@ LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID
Definition error.h:88
@ LXB_HTML_TOKENIZER_ERROR_UNSOINTA
Definition error.h:116
@ LXB_HTML_TOKENIZER_ERROR_ABDOSYID
Definition error.h:26
@ LXB_HTML_TOKENIZER_ERROR_NOCHRE
Definition error.h:92
@ LXB_HTML_TOKENIZER_ERROR_EOINDO
Definition error.h:50
@ LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA
Definition error.h:114
@ LXB_HTML_TOKENIZER_ERROR_NUCHRE
Definition error.h:98
@ LXB_HTML_TOKENIZER_ERROR_NOVOHTELSTTAWITRSO
Definition error.h:96
lxb_html_tree_error_id_t
Definition error.h:20
@ LXB_HTML_RULES_ERROR_DOTOAFHEMO
Definition error.h:56
@ LXB_HTML_RULES_ERROR_TECLTOWIOPINHEMO
Definition error.h:50
@ LXB_HTML_RULES_ERROR_NUCH
Definition error.h:26
@ LXB_HTML_RULES_ERROR_NOVOHTELSTTAWITRSO
Definition error.h:44
@ LXB_HTML_RULES_ERROR_CHINTATE
Definition error.h:80
@ LXB_HTML_RULES_ERROR_DOTOINFRMO
Definition error.h:88
@ LXB_HTML_RULES_ERROR_UNCLTO
Definition error.h:24
@ LXB_HTML_RULES_ERROR_DOTOINBEHEMO
Definition error.h:38
@ LXB_HTML_RULES_ERROR_UNELINOPELST
Definition error.h:66
@ LXB_HTML_RULES_ERROR_UNTO
Definition error.h:22
@ LXB_HTML_RULES_ERROR_MIELINOPELST
Definition error.h:68
@ LXB_HTML_RULES_ERROR_DOTOAFBOMO
Definition error.h:86
@ LXB_HTML_RULES_ERROR_MIELINSC
Definition error.h:72
@ LXB_HTML_RULES_ERROR_NOBOELINSC
Definition error.h:70
@ LXB_HTML_RULES_ERROR_BADOTOININMO
Definition error.h:32
@ LXB_HTML_RULES_ERROR_TEELISNOCUINHEMO
Definition error.h:52
@ LXB_HTML_RULES_ERROR_UNCHTO
Definition error.h:28
@ LXB_HTML_RULES_ERROR_DOTOAFFRMO
Definition error.h:90
@ LXB_HTML_RULES_ERROR_DOTOINBOMO
Definition error.h:60
@ LXB_HTML_RULES_ERROR_UNTOININMO
Definition error.h:30
@ LXB_HTML_RULES_ERROR_BAENOPELISWR
Definition error.h:62
@ LXB_HTML_RULES_ERROR_DOTOFOCOMO
Definition error.h:92
@ LXB_HTML_RULES_ERROR_DOTOINSEMO
Definition error.h:84
@ LXB_HTML_RULES_ERROR_DOTOINTAMO
Definition error.h:82
@ LXB_HTML_RULES_ERROR_UNCLTOINBEHEMO
Definition error.h:40
@ LXB_HTML_RULES_ERROR_DOTOINHEMO
Definition error.h:42
@ LXB_HTML_RULES_ERROR_UNELINACFOST
Definition error.h:76
@ LXB_HTML_RULES_ERROR_DOTOINBEHTMO
Definition error.h:34
@ LXB_HTML_RULES_ERROR_UNELINSC
Definition error.h:74
@ LXB_HTML_RULES_ERROR_HETOAFHEMO
Definition error.h:58
@ LXB_HTML_RULES_ERROR_HETOINHEMO
Definition error.h:46
@ LXB_HTML_RULES_ERROR_UNCLTOINHEMO
Definition error.h:48
@ LXB_HTML_RULES_ERROR_UNENOFFI
Definition error.h:78
@ LXB_HTML_RULES_ERROR_UNCLTOINBEHTMO
Definition error.h:36
@ LXB_HTML_RULES_ERROR_DOTOINHENOMO
Definition error.h:54
@ LXB_HTML_RULES_ERROR_OPELISWR
Definition error.h:64
enum entity_charset charset
Definition html_tables.h:39
zend_string * dom_strip_and_collapse_ascii_whitespace(zend_string *input)
lxb_html_document_t * lxb_html_document_create(void)
Definition document.c:189
lxb_status_t lxb_html_document_parse_chunk_begin(lxb_html_document_t *document)
Definition document.c:770
lxb_status_t lxb_html_document_parse_chunk(lxb_html_document_t *document, const lxb_char_t *html, size_t size)
Definition document.c:788
lxb_status_t lxb_html_document_parse_chunk_end(lxb_html_document_t *document)
Definition document.c:796
lxb_html_document_t * lxb_html_document_destroy(lxb_html_document_t *document)
Definition document.c:721
PHP_DOM_EXPORT const php_dom_ns_magic_token * php_dom_ns_is_html_magic_token
PHP_DOM_EXPORT xmlNsPtr php_dom_libxml_ns_mapper_ensure_html_ns(php_dom_libxml_ns_mapper *mapper)
PHP_DOM_EXPORT bool php_dom_ns_is_fast(const xmlNode *nodep, const php_dom_ns_magic_token *magic_token)
PHP_DOM_EXPORT xmlNsPtr php_dom_libxml_ns_mapper_get_ns(php_dom_libxml_ns_mapper *mapper, zend_string *prefix, zend_string *uri)
PHP_DOM_EXPORT const php_dom_ns_magic_token * php_dom_ns_is_svg_magic_token
PHP_DOM_EXPORT php_dom_libxml_ns_mapper * php_dom_get_ns_mapper(dom_object *object)
#define DOM_SVG_NS_URI
#define PHP_METHOD
Definition php.h:365
dom_object * php_dom_instantiate_object_helper(zval *return_value, zend_class_entry *ce, xmlNodePtr obj, dom_object *parent)
void php_dom_create_implementation(zval *retval, bool modern)
xmlDocPtr php_dom_create_html_doc(void)
#define DOM_HTML_NO_DEFAULT_NS
Definition php_dom.h:130
void dom_set_xml_class(php_libxml_ref_obj *document)
bool php_dom_adopt_node(xmlNodePtr nodep, dom_object *dom_object_new_document, xmlDocPtr new_document)
#define DOM_GET_OBJ(__ptr, __id, __prtype, __intern)
Definition php_dom.h:237
bool php_dom_create_nullable_object(xmlNodePtr obj, zval *return_value, dom_object *domobj)
xmlChar * php_dom_libxml_fix_file_path(xmlChar *path)
void dom_remove_all_children(xmlNodePtr nodep)
const XML_TEXT_NODE
const XML_ELEMENT_NODE
const XML_CDATA_SECTION_NODE
int line
Definition php_ffi.h:54
unsigned const char * text
Definition php_ffi.h:53
PHP_JSON_API size_t int options
Definition php_json.h:102
struct php_pcntl_pending_signal * head
Definition php_pcntl.h:47
xmlCharEncodingHandlerPtr encoding
Definition php_soap.h:170
struct _encode encode
Definition php_soap.h:42
PHPAPI php_stream_wrapper php_plain_files_wrapper
struct _php_stream php_stream
Definition php_streams.h:96
#define REPORT_ERRORS
#define php_stream_read(stream, buf, count)
#define php_stream_open_wrapper_ex(path, mode, options, opened, context)
#define php_stream_close(stream)
#define php_stream_tell(stream)
#define php_stream_write(stream, buf, count)
uint32_t last_line
Definition phpdbg.h:237
void php_dom_private_data_destroy(php_dom_private_data *data)
php_dom_private_data * php_dom_private_data_create(void)
uint32_t php_dom_get_template_count(const php_dom_private_data *private_data)
php_libxml_private_data_header * php_dom_libxml_private_data_header(php_dom_private_data *private_data)
zval * current
Definition session.c:1024
void * ptr
Definition xml_common.h:26
zend_object std
Definition xml_common.h:29
php_libxml_ref_obj * document
Definition xml_common.h:27
php_stream_wrapper * wrapper
Definition file.h:177
zend_result(* write_string_len)(void *application_data, const char *buf, size_t len)
php_dom_private_data * private_data
zend_result(* write_string)(void *application_data, const char *buf)
lexbor_libxml2_bridge_extracted_observations observations
lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter
lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter
lxb_char_t * name
Definition base.h:195
lxb_encoding_encode_f encode
Definition base.h:191
lxb_encoding_t encoding
Definition base.h:190
lxb_dom_document_t dom_document
Definition document.h:58
Definition encoding.h:19
const lxb_char_t * name
Definition encoding.h:20
const lxb_char_t * end
Definition encoding.h:21
lxb_html_tree_t * tree
Definition parser.h:32
unsigned int lxb_status_t
Definition types.h:28
unsigned char lxb_char_t
Definition types.h:27
uint32_t lxb_codepoint_t
Definition types.h:26
struct _dom_object dom_object
#define Z_DOMOBJ_P(zv)
Definition xml_common.h:36
ZEND_API ZEND_COLD void zend_value_error(const char *format,...)
Definition zend.c:1849
ZEND_API zend_result zend_parse_parameters(uint32_t num_args, const char *type_spec,...)
Definition zend_API.c:1300
ZEND_API ZEND_COLD void zend_argument_must_not_be_empty_error(uint32_t arg_num)
Definition zend_API.c:443
ZEND_API ZEND_COLD void zend_argument_value_error(uint32_t arg_num, const char *format,...)
Definition zend_API.c:433
#define ZEND_NUM_ARGS()
Definition zend_API.h:530
#define RETURN_FALSE
Definition zend_API.h:1058
#define ZEND_PARSE_PARAMETERS_NONE()
Definition zend_API.h:1623
#define RETURN_LONG(l)
Definition zend_API.h:1037
#define RETURN_THROWS()
Definition zend_API.h:1060
#define RETURN_STR(s)
Definition zend_API.h:1039
#define ZEND_THIS
Definition zend_API.h:523
#define RETVAL_FALSE
Definition zend_API.h:1032
#define ZVAL_EMPTY_STRING(z)
Definition zend_API.h:961
struct _zval_struct zval
error_reporting(?int $error_level=null)
strlen(string $string)
strncmp(string $string1, string $string2, int $length)
strcmp(string $string1, string $string2)
zend_string_release_ex(func->internal_function.function_name, 0)
#define OBJ_PROP_TO_NUM(offset)
#define OBJ_PROP_NUM(obj, num)
struct _zend_property_info zend_property_info
#define E_WARNING
Definition zend_errors.h:24
ZEND_API ZEND_COLD zend_object * zend_throw_exception_ex(zend_class_entry *exception_ce, zend_long code, const char *format,...)
ZEND_API void(ZEND_FASTCALL *zend_touch_vm_stack_data)(void *vm_stack_data)
#define EG(v)
int32_t zend_long
Definition zend_long.h:42
struct _zend_string zend_string
ZEND_API zend_property_info * zend_get_property_info(const zend_class_entry *ce, zend_string *member, int silent)
#define ZEND_IGNORE_VALUE(x)
#define ZEND_ASSERT(c)
#define UNEXPECTED(condition)
ZEND_API zend_string * zend_empty_string
Definition zend_string.c:51
#define ZSTR_VAL(zstr)
Definition zend_string.h:68
#define ZSTR_INIT_LITERAL(s, persistent)
#define ZSTR_LEN(zstr)
Definition zend_string.h:69
#define Z_TYPE_P(zval_p)
Definition zend_types.h:660
#define ZVAL_STR(z, s)
#define Z_STRVAL_P(zval_p)
Definition zend_types.h:975
#define Z_ISUNDEF_P(zval_p)
Definition zend_types.h:957
#define IS_STRING
Definition zend_types.h:606
#define Z_OBJ_P(zval_p)
Definition zend_types.h:990
#define Z_STR_P(zval_p)
Definition zend_types.h:972
#define IS_NULL
Definition zend_types.h:601
@ FAILURE
Definition zend_types.h:61
#define ZVAL_OBJ_COPY(z, o)
ZEND_RESULT_CODE zend_result
Definition zend_types.h:64
zval retval
zval * return_value
zend_property_info * prop_info
uint32_t arg_num
zend_string * name
bool result
value