11#define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12#define LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER
13#define LEXBOR_STR_RES_REPLACEMENT_CHARACTER
14#define LEXBOR_STR_RES_ALPHA_CHARACTER
15#define LEXBOR_STR_RES_MAP_HEX
16#define LEXBOR_STR_RES_MAP_NUM
20#define LXB_HTML_TOKENIZER_RES_ENTITIES_SBST
197lxb_html_tokenizer_state_to_ascii_utf_8(
size_t codepoint,
lxb_char_t *
data);
208 if (tkz->
is_eof ==
false) {
216 tkz->
state = lxb_html_tokenizer_state_data;
239 tkz->
state = lxb_html_tokenizer_state_tag_open;
325 if (tkz->
is_eof ==
false) {
331 tkz->
state = lxb_html_tokenizer_state_plaintext;
414 tkz->
state = lxb_html_tokenizer_state_tag_name;
423 else if (*
data == 0x2F) {
424 tkz->
state = lxb_html_tokenizer_state_end_tag_open;
430 else if (*
data == 0x21) {
431 tkz->
state = lxb_html_tokenizer_state_markup_declaration_open;
439 else if (*
data == 0x3F) {
440 tkz->
state = lxb_html_tokenizer_state_bogus_comment_before;
452 else if (*
data == 0x00) {
471 tkz->
state = lxb_html_tokenizer_state_data;
486 tkz->
state = lxb_html_tokenizer_state_tag_name;
497 else if (*
data == 0x3E) {
498 tkz->
state = lxb_html_tokenizer_state_data;
507 else if (*
data == 0x00) {
521 tkz->
state = lxb_html_tokenizer_state_bogus_comment_before;
649 tkz->
state = lxb_html_tokenizer_state_attribute_name;
658 tkz->
state = lxb_html_tokenizer_state_after_attribute_name;
664 tkz->
state = lxb_html_tokenizer_state_after_attribute_name;
674 tkz->
state = lxb_html_tokenizer_state_attribute_name;
716 tkz->
state = lxb_html_tokenizer_state_after_attribute_name;
727 tkz->
state = lxb_html_tokenizer_state_after_attribute_name;
745 tkz->
state = lxb_html_tokenizer_state_before_attribute_value;
805 tkz->
state = lxb_html_tokenizer_state_before_attribute_value;
828 tkz->
state = lxb_html_tokenizer_state_attribute_name;
865 lxb_html_tokenizer_state_attribute_value_double_quoted;
872 lxb_html_tokenizer_state_attribute_value_single_quoted;
888 tkz->
state = lxb_html_tokenizer_state_attribute_value_unquoted;
923 lxb_html_tokenizer_state_after_attribute_value_quoted;
931 tkz->
state = lxb_html_tokenizer_state_char_ref_attr;
932 tkz->
state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
942 tkz->
state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
1017 lxb_html_tokenizer_state_after_attribute_value_quoted;
1025 tkz->
state = lxb_html_tokenizer_state_char_ref_attr;
1026 tkz->
state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
1036 tkz->
state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
1042 tkz->
pos[-1] = 0x0A;
1046 if (*
data != 0x0A) {
1127 tkz->
state = lxb_html_tokenizer_state_char_ref_attr;
1128 tkz->
state_return = lxb_html_tokenizer_state_attribute_value_unquoted;
1262 if (*
data == 0x0A) {
1320 tkz->
state = lxb_html_tokenizer_state_bogus_comment;
1354 tkz->
state_return = lxb_html_tokenizer_state_bogus_comment;
1360 tkz->
pos[-1] = 0x0A;
1364 if (*
data != 0x0A) {
1414 if (tkz->
is_eof ==
false) {
1419 if (*
data == 0x2D) {
1421 tkz->
state = lxb_html_tokenizer_state_markup_declaration_comment;
1425 if (
data[1] == 0x2D) {
1434 else if (*
data == 0x44 || *
data == 0x64) {
1438 tkz->
state = lxb_html_tokenizer_state_markup_declaration_doctype;
1451 else if (*
data == 0x5B) {
1455 tkz->
state = lxb_html_tokenizer_state_markup_declaration_cdata;
1467 tkz->
state = lxb_html_tokenizer_state_cdata_section_before;
1472 tkz->
state = lxb_html_tokenizer_state_bogus_comment_before;
1487 tkz->
state = lxb_html_tokenizer_state_bogus_comment_before;
1502 if (*
data == 0x2D) {
1510 tkz->
state = lxb_html_tokenizer_state_bogus_comment_before;
1530 tkz->
state = lxb_html_tokenizer_state_bogus_comment_before;
1562 tkz->
state = lxb_html_tokenizer_state_bogus_comment_before;
1572 tkz->
state = lxb_html_tokenizer_state_cdata_section_before;
1578 tkz->
state = lxb_html_tokenizer_state_bogus_comment_before;
1595 if (tkz->
is_eof ==
false) {
1604 tkz->
state = lxb_html_tokenizer_state_cdata_section;
1626 tkz->
state = lxb_html_tokenizer_state_cdata_section_bracket;
1635 tkz->
state_return = lxb_html_tokenizer_state_cdata_section;
1641 tkz->
pos[-1] = 0x0A;
1645 if (*
data != 0x0A) {
1699 if (*
data == 0x5D) {
1700 tkz->
state = lxb_html_tokenizer_state_cdata_section_end;
1706 tkz->
state = lxb_html_tokenizer_state_cdata_section;
1720 if (*
data == 0x5D) {
1725 else if (*
data == 0x3E) {
1736 tkz->
state = lxb_html_tokenizer_state_cdata_section;
1750 return _lxb_html_tokenizer_state_char_ref(tkz,
data,
end);
1760 return _lxb_html_tokenizer_state_char_ref(tkz,
data,
end);
1770 tkz->
entity = &lxb_html_tokenizer_res_entities_sbst[1];
1774 tkz->
state = lxb_html_tokenizer_state_char_ref_named;
1779 else if (*
data == 0x23) {
1785 tkz->
state = lxb_html_tokenizer_state_char_ref_numeric;
1809 size_t size, tail_size;
1818 if (entry ==
NULL) {
1823 if (entry->
value[0] != 0) {
1828 entry = &lxb_html_tokenizer_res_entities_sbst[ entry->
next ];
1843 tkz->
state = lxb_html_tokenizer_state_char_ref_ambiguous_ampersand;
1878 if (tail_size != 0) {
1887 tkz->
pos - tail_size, tail_size);
1909 if (*
data == 0x3B) {
1933 if (*
data == 0x78 || *
data == 0x58) {
1936 tkz->
state = lxb_html_tokenizer_state_char_ref_hexademical_start;
1941 tkz->
state = lxb_html_tokenizer_state_char_ref_decimal_start;
1956 tkz->
state = lxb_html_tokenizer_state_char_ref_hexademical;
1978 tkz->
state = lxb_html_tokenizer_state_char_ref_decimal;
2006 return lxb_html_tokenizer_state_char_ref_numeric_end(tkz,
data,
end);
2036 return lxb_html_tokenizer_state_char_ref_numeric_end(tkz,
data,
end);
2092 case 0xFFFE:
case 0xFFFF:
case 0x1FFFE:
case 0x1FFFF:
case 0x2FFFE:
2093 case 0x2FFFF:
case 0x3FFFE:
case 0x3FFFF:
case 0x4FFFE:
case 0x4FFFF:
2094 case 0x5FFFE:
case 0x5FFFF:
case 0x6FFFE:
case 0x6FFFF:
case 0x7FFFE:
2095 case 0x7FFFF:
case 0x8FFFE:
case 0x8FFFF:
case 0x9FFFE:
case 0x9FFFF:
2096 case 0xAFFFE:
case 0xAFFFF:
case 0xBFFFE:
case 0xBFFFF:
case 0xCFFFE:
2097 case 0xCFFFF:
case 0xDFFFE:
case 0xDFFFF:
case 0xEFFFE:
case 0xEFFFF:
2098 case 0xFFFFE:
case 0xFFFFF:
2132 memcpy(
start, lexbor_str_res_ansi_replacement_character,
2133 sizeof(lexbor_str_res_ansi_replacement_character) - 1);
2135 tkz->
pos =
start +
sizeof(lexbor_str_res_ansi_replacement_character) - 1;
2141lxb_html_tokenizer_state_to_ascii_utf_8(
size_t codepoint,
lxb_char_t *
data)
2148 if (codepoint <= 0x0000007F) {
2150 data[0] = (char) codepoint;
2154 else if (codepoint <= 0x000007FF) {
2156 data[0] = (char) (0xC0 | (codepoint >> 6 ));
2157 data[1] = (char) (0x80 | (codepoint & 0x3F));
2161 else if (codepoint <= 0x0000FFFF) {
2163 data[0] = (char) (0xE0 | ((codepoint >> 12)));
2164 data[1] = (char) (0x80 | ((codepoint >> 6 ) & 0x3F));
2165 data[2] = (char) (0x80 | ( codepoint & 0x3F));
2169 else if (codepoint <= 0x001FFFFF) {
2171 data[0] = (char) (0xF0 | ( codepoint >> 18));
2172 data[1] = (char) (0x80 | ((codepoint >> 12) & 0x3F));
2173 data[2] = (char) (0x80 | ((codepoint >> 6 ) & 0x3F));
2174 data[3] = (char) (0x80 | ( codepoint & 0x3F));
@ LXB_STATUS_ERROR_OVERFLOW
lxb_ns_id_t lxb_html_tokenizer_current_namespace(lxb_html_tokenizer_t *tkz)
struct lexbor_hash lexbor_hash_t
hash(string $algo, string $data, bool $binary=false, array $options=[])
struct lxb_html_tokenizer lxb_html_tokenizer_t
@ LXB_HTML_TOKEN_TYPE_CLOSE
@ LXB_HTML_TOKEN_TYPE_CLOSE_SELF
lxb_html_tokenizer_error_t * lxb_html_tokenizer_error_add(lexbor_array_obj_t *parse_errors, const lxb_char_t *pos, lxb_html_tokenizer_error_id_t id)
@ LXB_HTML_TOKENIZER_ERROR_UNCHINUNATVA
@ LXB_HTML_TOKENIZER_ERROR_MIATVA
@ LXB_HTML_TOKENIZER_ERROR_UNNACHRE
@ LXB_HTML_TOKENIZER_ERROR_UNEQSIBEATNA
@ LXB_HTML_TOKENIZER_ERROR_SUCHRE
@ LXB_HTML_TOKENIZER_ERROR_COCHRE
@ LXB_HTML_TOKENIZER_ERROR_MISEAFCHRE
@ LXB_HTML_TOKENIZER_ERROR_UNCHINATNA
@ LXB_HTML_TOKENIZER_ERROR_EOINTA
@ LXB_HTML_TOKENIZER_ERROR_UNNUCH
@ LXB_HTML_TOKENIZER_ERROR_INFICHOFTANA
@ LXB_HTML_TOKENIZER_ERROR_ABOFDIINNUCHRE
@ LXB_HTML_TOKENIZER_ERROR_INOPCO
@ LXB_HTML_TOKENIZER_ERROR_MIENTANA
@ LXB_HTML_TOKENIZER_ERROR_EOBETANA
@ LXB_HTML_TOKENIZER_ERROR_CHREOUUNRA
@ LXB_HTML_TOKENIZER_ERROR_MIWHBEAT
@ LXB_HTML_TOKENIZER_ERROR_EOINCD
@ LXB_HTML_TOKENIZER_ERROR_UNSOINTA
@ LXB_HTML_TOKENIZER_ERROR_NOCHRE
@ LXB_HTML_TOKENIZER_ERROR_UNQUMAINOFTANA
@ LXB_HTML_TOKENIZER_ERROR_NUCHRE
const lxb_char_t * lxb_html_tokenizer_state_before_attribute_name(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
const lxb_char_t * lxb_html_tokenizer_state_plaintext_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
lxb_dom_attr_data_t * lxb_dom_attr_local_name_append(lexbor_hash_t *hash, const lxb_char_t *name, size_t length)
const lxb_char_t * lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
const lxb_char_t * lxb_html_tokenizer_state_data_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
const lxb_char_t * lxb_html_tokenizer_state_cr(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
const lxb_tag_data_t * lxb_tag_append_lower(lexbor_hash_t *hash, const lxb_char_t *name, size_t length)
const lxb_char_t * lxb_html_tokenizer_state_self_closing_start_tag(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
#define lxb_html_tokenizer_state_token_attr_set_value_end(tkz, v_end)
#define lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, v_return)
#define lxb_html_tokenizer_state_token_attr_set_name_end_oef(tkz)
#define lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz)
#define lxb_html_tokenizer_state_append_data_m(tkz, v_data)
#define lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, v_begin)
#define lxb_html_tokenizer_state_set_text(tkz)
#define lxb_html_tokenizer_state_token_set_begin(tkz, v_begin)
#define lxb_html_tokenizer_state_append_replace_m(tkz)
#define lxb_html_tokenizer_state_set_name_m(tkz)
#define lxb_html_tokenizer_state_token_done_m(tkz, v_end)
#define lxb_html_tokenizer_state_token_attr_set_name_end(tkz, v_end)
#define lxb_html_tokenizer_state_set_tag_m(tkz, _start, _end)
#define lxb_html_tokenizer_state_token_set_end(tkz, v_end)
#define lxb_html_tokenizer_state_token_set_end_oef(tkz)
#define lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, v_begin)
#define lxb_html_tokenizer_state_token_done_wo_check_m(tkz, v_end)
#define lxb_html_tokenizer_state_token_emit_text_not_empty_m(tkz, v_end)
#define lxb_html_tokenizer_state_begin_set(tkz, v_data)
#define lxb_html_tokenizer_state_set_value_m(tkz)
#define lxb_html_tokenizer_state_append_m(tkz, v_data, size)
lxb_inline lxb_status_t lxb_html_tokenizer_temp_realloc(lxb_html_tokenizer_t *tkz, size_t size)
unsigned const char * end
unsigned const char * pos
lxb_inline const lexbor_sbst_entry_static_t * lexbor_sbst_entry_static_find(const lexbor_sbst_entry_static_t *strt, const lexbor_sbst_entry_static_t *root, const lxb_char_t key)
const lxb_char_t * lxb_html_tokenizer_state_doctype_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
bool lexbor_str_data_ncasecmp(const lxb_char_t *first, const lxb_char_t *sec, size_t size)
const lxb_char_t * lexbor_str_data_ncasecmp_first(const lxb_char_t *first, const lxb_char_t *sec, size_t sec_size)
bool lexbor_str_data_ncmp(const lxb_char_t *first, const lxb_char_t *sec, size_t size)
#define LEXBOR_STR_RES_SLIP
const lxb_char_t * value_begin
lxb_html_token_type_t type
lxb_html_token_attr_t * attr_last
const lxb_char_t * markup
lxb_html_tokenizer_state_f state_return
const lexbor_sbst_entry_static_t * entity
lxb_html_tokenizer_state_f state
lexbor_array_obj_t * parse_errors
const lexbor_sbst_entry_static_t * entity_match
lxb_inline const lxb_char_t * lexbor_swar_seek4(const lxb_char_t *data, const lxb_char_t *end, lxb_char_t c1, lxb_char_t c2, lxb_char_t c3, lxb_char_t c4)
struct lxb_html_token_attr lxb_html_token_attr_t