22#if defined(HAVE_LIBXML) && defined(HAVE_DOM)
30#include <libxml/parserInternals.h>
31#include <libxml/HTMLtree.h>
33#define WORK_LIST_INIT_SIZE 128
35#define LXML_INTERNED_STRINGS_SIZE (sizeof(void *) * 2)
37typedef struct work_list_item {
39 uintptr_t current_active_namespace;
40 xmlNodePtr lxml_parent;
44static void lexbor_libxml2_bridge_work_list_item_push(
47 uintptr_t current_active_namespace,
48 xmlNodePtr lxml_parent,
54 item->current_active_namespace = current_active_namespace;
55 item->lxml_parent = lxml_parent;
56 item->lxml_ns = lxml_ns;
59static unsigned short sanitize_line_nr(
size_t line)
61 if (
line > USHRT_MAX) {
64 return (
unsigned short)
line;
73static struct lxml_ns get_libxml_namespace_href(uintptr_t lexbor_namespace)
84static zend_always_inline xmlNodePtr lexbor_libxml2_bridge_new_text_node_fast(xmlDocPtr lxml_doc,
const lxb_char_t *
data,
size_t data_length,
bool compact_text_nodes)
86 if (compact_text_nodes && data_length < LXML_INTERNED_STRINGS_SIZE) {
88 xmlNodePtr lxml_text = xmlMalloc(
sizeof(*lxml_text));
92 memset(lxml_text, 0,
sizeof(*lxml_text));
93 lxml_text->name = xmlStringText;
95 lxml_text->doc = lxml_doc;
96 lxml_text->content = BAD_CAST &lxml_text->properties;
102 return xmlNewDocTextLen(lxml_doc, (
const xmlChar *)
data, data_length);
110 bool compact_text_nodes,
111 bool create_default_ns,
119 xmlNsPtr xlink_ns =
NULL;
120 xmlNsPtr prefixed_xmlns_ns =
NULL;
126 lexbor_libxml2_bridge_work_list_item_push(&work_list, node,
LXB_NS__UNDEF, root,
NULL);
129 work_list_item *current_stack_item;
132 xmlNodePtr lxml_parent = current_stack_item->lxml_parent;
144 xmlNodePtr lxml_element = xmlNewDocNode(lxml_doc,
NULL,
name,
NULL);
149 xmlAddChild(lxml_parent, lxml_element);
150 lxml_element->line = sanitize_line_nr(node->
line);
153 uintptr_t entering_namespace = element->
node.
ns;
154 xmlNsPtr current_lxml_ns = current_stack_item->lxml_ns;
155 if (create_default_ns &&
UNEXPECTED(entering_namespace != current_stack_item->current_active_namespace)) {
157 current_lxml_ns = html_ns;
159 struct lxml_ns ns = get_libxml_namespace_href(entering_namespace);
160 zend_string *uri = zend_string_init(ns.href, ns.href_len,
false);
164 current_lxml_ns->_private = (
void *) ns.token;
169 lxml_element->ns = current_lxml_ns;
173 xmlNodePtr lxml_child_parent = lxml_element;
176 if (create_default_ns) {
177 lxml_child_parent = xmlNewDocFragment(lxml_doc);
183 lxml_child_parent->parent = lxml_element;
189 if (template->content !=
NULL) {
190 child_node =
template->content->node.
last_child;
194 for (; child_node !=
NULL; child_node = child_node->
prev) {
195 lexbor_libxml2_bridge_work_list_item_push(
204 xmlAttrPtr last_added_attr =
NULL;
207 size_t local_name_length, value_length;
209 if (
attr->node.prefix) {
210 const char *
pos =
strchr((
const char *) local_name,
':');
222 xmlAttrPtr lxml_attr = xmlMalloc(
sizeof(xmlAttr));
228 memset(lxml_attr, 0,
sizeof(xmlAttr));
230 lxml_attr->parent = lxml_element;
231 lxml_attr->name = xmlDictLookup(lxml_doc->dict, local_name, local_name_length);
232 lxml_attr->doc = lxml_doc;
233 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc,
value, value_length,
true );
235 xmlFreeProp(lxml_attr);
240 lxml_attr->children = lxml_attr->last = lxml_text;
241 lxml_text->parent = (xmlNodePtr) lxml_attr;
244 if (
strcmp((
const char *) local_name,
"xmlns") != 0) {
245 if (prefixed_xmlns_ns ==
NULL) {
248 lxml_attr->ns = prefixed_xmlns_ns;
254 if (xlink_ns ==
NULL) {
258 lxml_attr->ns = xlink_ns;
261 if (last_added_attr ==
NULL) {
262 lxml_element->properties = lxml_attr;
264 last_added_attr->next = lxml_attr;
265 lxml_attr->prev = last_added_attr;
267 last_added_attr = lxml_attr;
270 if (local_name_length == 2 && local_name[0] ==
'i' && local_name[1] ==
'd' &&
attr->node.ns ==
LXB_NS_HTML) {
271 xmlAddID(
NULL, lxml_doc,
value, lxml_attr);
279 size_t data_length =
text->char_data.data.length;
284 xmlNodePtr lxml_text = lexbor_libxml2_bridge_new_text_node_fast(lxml_doc,
data, data_length, compact_text_nodes);
289 xmlAddChild(lxml_parent, lxml_text);
290 if (node->
line >= USHRT_MAX) {
291 lxml_text->line = USHRT_MAX;
294 lxml_text->line = (
unsigned short) node->
line;
299 size_t public_id_len, system_id_len;
302 xmlDtdPtr lxml_dtd = xmlCreateIntSubset(
305 public_id_len ? public_id :
NULL,
306 system_id_len ? system_id :
NULL
315 xmlNodePtr lxml_comment = xmlNewDocComment(lxml_doc, comment->
char_data.
data.
data);
320 xmlAddChild(lxml_parent, lxml_comment);
321 lxml_comment->line = sanitize_line_nr(node->
line);
331 memset(ctx, 0,
sizeof(*ctx));
347 bool compact_text_nodes,
348 bool create_default_ns,
359 (xmlNodePtr) lxml_doc,
365 xmlFreeDoc(lxml_doc);
375 xmlNodePtr *fragment_out,
376 bool compact_text_nodes,
377 bool create_default_ns,
381 xmlNodePtr fragment = xmlNewDocFragment(lxml_doc);
394 xmlFreeNode(fragment);
397 *fragment_out = fragment;
406 size_t *error_index_offset_tokenizer,
407 size_t *error_index_offset_tree
414 size_t index = *error_index_offset_tokenizer;
422 token_error->
pos - input_html + chunk_offset
427 *error_index_offset_tokenizer = index;
431 index = *error_index_offset_tree;
439 tree_error->
line + 1,
446 *error_index_offset_tree = index;
451 switch (quirks_mode) {
void * lexbor_array_obj_pop(lexbor_array_obj_t *array)
void * lexbor_array_obj_push_wo_cls(lexbor_array_obj_t *array)
lxb_status_t lexbor_array_obj_init(lexbor_array_obj_t *array, size_t size, size_t struct_size)
lexbor_array_obj_t * lexbor_array_obj_destroy(lexbor_array_obj_t *array, bool self_destroy)
lxb_inline void * lexbor_array_obj_get(const lexbor_array_obj_t *array, size_t idx)
lxb_inline const lxb_char_t * lxb_dom_attr_value(lxb_dom_attr_t *attr, size_t *len)
strchr(string $haystack, string $needle, bool $before_needle=false)
lxb_inline const lxb_char_t * lxb_dom_document_type_system_id(lxb_dom_document_type_t *doc_type, size_t *len)
lxb_inline const lxb_char_t * lxb_dom_document_type_public_id(lxb_dom_document_type_t *doc_type, size_t *len)
lxb_inline const lxb_char_t * lxb_dom_document_type_name(lxb_dom_document_type_t *doc_type, size_t *len)
struct lxb_dom_document_type lxb_dom_document_type_t
#define lxb_dom_interface_element(obj)
#define lxb_dom_interface_node(obj)
struct lxb_dom_comment lxb_dom_comment_t
#define lxb_dom_interface_document(obj)
struct lxb_dom_node lxb_dom_node_t
#define lxb_dom_interface_comment(obj)
#define lxb_dom_interface_document_type(obj)
struct lxb_dom_attr lxb_dom_attr_t
struct lxb_dom_text lxb_dom_text_t
struct lxb_dom_element lxb_dom_element_t
#define lxb_dom_interface_text(obj)
@ LXB_DOM_DOCUMENT_CMODE_QUIRKS
@ LXB_DOM_DOCUMENT_CMODE_LIMITED_QUIRKS
@ LXB_DOM_DOCUMENT_CMODE_NO_QUIRKS
@ LXB_DOM_NODE_TYPE_COMMENT
@ LXB_DOM_NODE_TYPE_DOCUMENT_TYPE
@ LXB_DOM_NODE_TYPE_ELEMENT
memset(ptr, 0, type->size)
void(* lexbor_libxml2_bridge_tokenizer_error_reporter)(void *application_data, lxb_html_tokenizer_error_t *error, size_t offset)
void lexbor_libxml2_bridge_parse_set_error_callbacks(lexbor_libxml2_bridge_parse_context *ctx, lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter, lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter)
lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_fragment(lxb_dom_node_t *start_node, xmlDocPtr lxml_doc, xmlNodePtr *fragment_out, bool compact_text_nodes, bool create_default_ns, php_dom_private_data *private_data)
lexbor_libxml2_bridge_status lexbor_libxml2_bridge_convert_document(lxb_html_document_t *document, xmlDocPtr *doc_out, bool compact_text_nodes, bool create_default_ns, php_dom_private_data *private_data)
lexbor_libxml2_bridge_status
@ LEXBOR_LIBXML2_BRIDGE_STATUS_OVERFLOW
@ LEXBOR_LIBXML2_BRIDGE_STATUS_OOM
@ LEXBOR_LIBXML2_BRIDGE_STATUS_OK
void lexbor_libxml2_bridge_copy_observations(lxb_html_tree_t *tree, lexbor_libxml2_bridge_extracted_observations *observations)
void lexbor_libxml2_bridge_parse_context_init(lexbor_libxml2_bridge_parse_context *ctx)
void lexbor_libxml2_bridge_report_errors(const lexbor_libxml2_bridge_parse_context *ctx, lxb_html_parser_t *parser, const lxb_char_t *input_html, size_t chunk_offset, size_t *error_index_offset_tokenizer, size_t *error_index_offset_tree)
void(* lexbor_libxml2_bridge_tree_error_reporter)(void *application_data, lxb_html_tree_error_t *error, size_t line, size_t column, size_t len)
struct lxb_html_tree lxb_html_tree_t
struct lxb_html_template_element lxb_html_template_element_t
#define lxb_html_interface_template(obj)
struct lxb_html_document lxb_html_document_t
lxb_inline lxb_html_tokenizer_t * lxb_html_parser_tokenizer(lxb_html_parser_t *parser)
lxb_inline lxb_html_tree_t * lxb_html_parser_tree(lxb_html_parser_t *parser)
const lxb_char_t * lxb_dom_attr_qualified_name(lxb_dom_attr_t *attr, size_t *len)
const lxb_char_t * lxb_dom_element_qualified_name(lxb_dom_element_t *element, size_t *len)
PHP_DOM_EXPORT const php_dom_ns_magic_token * php_dom_ns_is_html_magic_token
PHP_DOM_EXPORT const php_dom_ns_magic_token * php_dom_ns_is_xmlns_magic_token
PHP_DOM_EXPORT xmlNsPtr php_dom_libxml_ns_mapper_ensure_html_ns(php_dom_libxml_ns_mapper *mapper)
PHP_DOM_EXPORT const php_dom_ns_magic_token * php_dom_ns_is_xlink_magic_token
struct php_dom_ns_magic_token php_dom_ns_magic_token
PHP_DOM_EXPORT xmlNsPtr php_dom_libxml_ns_mapper_ensure_prefixless_xmlns_ns(php_dom_libxml_ns_mapper *mapper)
PHP_DOM_EXPORT xmlNsPtr php_dom_libxml_ns_mapper_get_ns(php_dom_libxml_ns_mapper *mapper, zend_string *prefix, zend_string *uri)
PHP_DOM_EXPORT const php_dom_ns_magic_token * php_dom_ns_is_svg_magic_token
#define DOM_MATHML_NS_URI
PHP_DOM_EXPORT const php_dom_ns_magic_token * php_dom_ns_is_mathml_magic_token
PHP_DOM_EXPORT xmlNsPtr php_dom_libxml_ns_mapper_get_ns_raw_strings_nullsafe(php_dom_libxml_ns_mapper *mapper, const char *prefix, const char *uri)
xmlDocPtr php_dom_create_html_doc(void)
unsigned const char * pos
unsigned const char * text
void php_dom_add_templated_content(php_dom_private_data *private_data, const xmlNode *template_node, xmlNodePtr fragment)
php_dom_libxml_ns_mapper * php_dom_ns_mapper_from_private(php_dom_private_data *private_data)
void dom_add_element_ns_hook(php_dom_private_data *private_data, xmlNodePtr element)
lexbor_libxml2_bridge_tree_error_reporter tree_error_reporter
lexbor_libxml2_bridge_tokenizer_error_reporter tokenizer_error_reporter
lxb_dom_attr_t * first_attr
lxb_dom_node_t * last_child
lexbor_array_obj_t * parse_errors
bool has_explicit_html_tag
bool has_explicit_head_tag
lexbor_array_obj_t * parse_errors
lxb_html_document_t * document
bool has_explicit_body_tag
lxb_inline bool lxb_html_tree_node_is(lxb_dom_node_t *node, lxb_tag_id_t tag_id)
strcmp(string $string1, string $string2)
zend_string_release_ex(func->internal_function.function_name, 0)
struct _zend_string zend_string
#define EXPECTED(condition)
#define zend_always_inline
#define EMPTY_SWITCH_DEFAULT_CASE()
#define UNEXPECTED(condition)