php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
tokenizer.c
Go to the documentation of this file.
1/*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
12#include "lexbor/html/tree.h"
13
14#define LXB_HTML_TAG_RES_DATA
15#define LXB_HTML_TAG_RES_SHS_DATA
16#include "lexbor/html/tag_res.h"
17
18
19#define LXB_HTML_TKZ_TEMP_SIZE (4096 * 4)
20
21
22enum {
27};
28
29
31
32
33static lxb_html_token_t *
34lxb_html_tokenizer_token_done(lxb_html_tokenizer_t *tkz,
35 lxb_html_token_t *token, void *ctx);
36
37
43
46{
48
49 if (tkz == NULL) {
51 }
52
53 /* mraw for temporary strings or structures */
54 tkz->mraw = lexbor_mraw_create();
55 status = lexbor_mraw_init(tkz->mraw, 1024);
56 if (status != LXB_STATUS_OK) {
57 return status;
58 }
59
60 /* Init Token */
61 tkz->token = NULL;
62
65 4096, sizeof(lxb_html_token_t));
66 if (status != LXB_STATUS_OK) {
67 return status;
68 }
69
70 /* Init Token Attributes */
73 sizeof(lxb_html_token_attr_t));
74 if (status != LXB_STATUS_OK) {
75 return status;
76 }
77
78 /* Parse errors */
82 if (status != LXB_STATUS_OK) {
83 return status;
84 }
85
86 /* Temporary memory for tag name and attributes. */
88 if (tkz->start == NULL) {
90 }
91
92 tkz->pos = tkz->start;
93 tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
94 /* current_line & current_column already initialized by calloc (zero-based) */
95
96 tkz->tree = NULL;
97 tkz->tags = NULL;
98 tkz->attrs = NULL;
99 tkz->attrs_mraw = NULL;
100
102 tkz->state_return = NULL;
103
104 tkz->callback_token_done = lxb_html_tokenizer_token_done;
106
107 tkz->is_eof = false;
108 tkz->status = LXB_STATUS_OK;
109
110 tkz->base = NULL;
111 tkz->ref_count = 1;
112
113 return LXB_STATUS_OK;
114}
115
118 lxb_html_tokenizer_t *tkz_from)
119{
121
122 tkz_to->tags = tkz_from->tags;
123 tkz_to->attrs = tkz_from->attrs;
124 tkz_to->attrs_mraw = tkz_from->attrs_mraw;
125 tkz_to->mraw = tkz_from->mraw;
126
127 /* Token and Attributes */
128 tkz_to->token = NULL;
129
130 tkz_to->dobj_token = tkz_from->dobj_token;
131 tkz_to->dobj_token_attr = tkz_from->dobj_token_attr;
132
133 /* Parse errors */
137 if (status != LXB_STATUS_OK) {
138 return status;
139 }
140
142 tkz_to->state_return = NULL;
143
144 tkz_to->callback_token_done = lxb_html_tokenizer_token_done;
145 tkz_to->callback_token_ctx = NULL;
146
147 tkz_to->is_eof = false;
148 tkz_to->status = LXB_STATUS_OK;
149
150 tkz_to->base = tkz_from;
151 tkz_to->ref_count = 1;
152
153 tkz_to->start = tkz_from->start;
154 tkz_to->end = tkz_from->end;
155 tkz_to->pos = tkz_to->start;
156 tkz_to->current_line = tkz_from->current_line;
157 tkz_to->current_column = tkz_from->current_column;
158
159 return LXB_STATUS_OK;
160}
161
164{
165 if (tkz == NULL) {
166 return NULL;
167 }
168
169 if (tkz->base != NULL) {
170 return lxb_html_tokenizer_ref(tkz->base);
171 }
172
173 tkz->ref_count++;
174
175 return tkz;
176}
177
180{
181 if (tkz == NULL || tkz->ref_count == 0) {
182 return NULL;
183 }
184
185 if (tkz->base != NULL) {
187 }
188
189 tkz->ref_count--;
190
191 if (tkz->ref_count == 0) {
193 }
194
195 return NULL;
196}
197
198void
200{
201 tkz->tree = NULL;
202
204 tkz->state_return = NULL;
205
206 tkz->is_eof = false;
207 tkz->status = LXB_STATUS_OK;
208
209 tkz->pos = tkz->start;
210
214
216}
217
220{
221 if (tkz == NULL) {
222 return NULL;
223 }
224
225 if (tkz->base == NULL) {
228 }
229
232 }
233
234 lexbor_mraw_destroy(tkz->mraw, true);
237 lexbor_free(tkz->start);
238 }
239
241
242 return lexbor_free(tkz);
243}
244
247{
248 tkz->tags = lexbor_hash_create();
249 return lexbor_hash_init(tkz->tags, table_size, sizeof(lxb_tag_data_t));
250}
251
252void
257
260{
261 tkz->attrs = lexbor_hash_create();
262 return lexbor_hash_init(tkz->attrs, table_size,
263 sizeof(lxb_dom_attr_data_t));
264}
265
266void
271
274{
275 if (tkz->tags == NULL) {
276 tkz->status = lxb_html_tokenizer_tags_make(tkz, 256);
277 if (tkz->status != LXB_STATUS_OK) {
278 return tkz->status;
279 }
280
282 }
283
284 if (tkz->attrs == NULL) {
285 tkz->status = lxb_html_tokenizer_attrs_make(tkz, 256);
286 if (tkz->status != LXB_STATUS_OK) {
287 return tkz->status;
288 }
289
291 }
292
293 if (tkz->attrs_mraw == NULL) {
294 tkz->attrs_mraw = tkz->mraw;
295
297 }
298
300 if (tkz->token == NULL) {
302 }
303
304 return LXB_STATUS_OK;
305}
306
309 size_t size)
310{
311 const lxb_char_t *end = data + size;
312
313 tkz->is_eof = false;
314 tkz->status = LXB_STATUS_OK;
315 tkz->last = end;
316
317 while (data < end) {
318 size_t current_column = tkz->current_column;
319 const lxb_char_t *new_data = tkz->state(tkz, data, end);
320 while (data < new_data) {
321 /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
322 if (*data == '\n') {
323 tkz->current_line++;
324 current_column = 0;
325 } else {
326 /* Other characters can be mapped back to the Unicode codepoint offset because UTF-8 is a prefix code.
327 * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
328 if ((*data & 0b11000000) == 0b10000000) {
329 /* Continuation byte, do nothing */
330 } else {
331 /* First byte for a codepoint */
332 current_column++;
333 }
334 }
335 data++;
336 }
337 tkz->current_column = current_column;
338 }
339
340 return tkz->status;
341}
342
345{
346 const lxb_char_t *data, *end;
347
348 tkz->status = LXB_STATUS_OK;
349
350 /* Send fake EOF data. */
353
354 tkz->is_eof = true;
355
356 while (tkz->state(tkz, data, end) < end) {
357 /* empty loop */
358 }
359
360 tkz->is_eof = false;
361
362 if (tkz->status != LXB_STATUS_OK) {
363 return tkz->status;
364 }
365
366 /* Emit fake token: END OF FILE */
368
370
371 tkz->token = tkz->callback_token_done(tkz, tkz->token,
372 tkz->callback_token_ctx);
373
374 if (tkz->token == NULL && tkz->status == LXB_STATUS_OK) {
376 }
377
378 return tkz->status;
379}
380
381static lxb_html_token_t *
382lxb_html_tokenizer_token_done(lxb_html_tokenizer_t *tkz,
383 lxb_html_token_t *token, void *ctx)
384{
385 return token;
386}
387
390{
391 if (tkz->tree == NULL) {
392 return LXB_NS__UNDEF;
393 }
394
396
397 if (node == NULL) {
398 return LXB_NS__UNDEF;
399 }
400
401 return node->ns;
402}
403
404void
406 lxb_tag_id_t tag_id, lxb_ns_id_t ns)
407{
408 if (ns != LXB_NS_HTML) {
410
411 return;
412 }
413
414 switch (tag_id) {
415 case LXB_TAG_TITLE:
416 case LXB_TAG_TEXTAREA:
417 tkz->tmp_tag_id = tag_id;
419
420 break;
421
422 case LXB_TAG_STYLE:
423 case LXB_TAG_XMP:
424 case LXB_TAG_IFRAME:
425 case LXB_TAG_NOEMBED:
426 case LXB_TAG_NOFRAMES:
427 tkz->tmp_tag_id = tag_id;
429
430 break;
431
432 case LXB_TAG_SCRIPT:
433 tkz->tmp_tag_id = tag_id;
435
436 break;
437
438 case LXB_TAG_NOSCRIPT:
439 if (scripting) {
440 tkz->tmp_tag_id = tag_id;
442
443 return;
444 }
445
447
448 break;
449
452
453 break;
454
455 default:
456 break;
457 }
458}
459
460/*
461 * No inline functions for ABI.
462 */
463void
469
470void
477
478void *
483
484void
490
491void
497
503
504void
510
516
void lexbor_array_obj_clean(lexbor_array_obj_t *array)
Definition array_obj.c:42
lexbor_array_obj_t * lexbor_array_obj_create(void)
Definition array_obj.c:11
lxb_status_t lexbor_array_obj_init(lexbor_array_obj_t *array, size_t size, size_t struct_size)
Definition array_obj.c:17
lexbor_array_obj_t * lexbor_array_obj_destroy(lexbor_array_obj_t *array, bool self_destroy)
Definition array_obj.c:50
@ LXB_STATUS_ERROR_MEMORY_ALLOCATION
Definition base.h:51
@ LXB_STATUS_ERROR_OBJECT_IS_NULL
Definition base.h:52
@ LXB_STATUS_OK
Definition base.h:49
@ LXB_STATUS_ERROR
Definition base.h:50
DNS_STATUS status
Definition dns_win32.c:49
lxb_status_t lexbor_dobject_init(lexbor_dobject_t *dobject, size_t chunk_size, size_t struct_size)
Definition dobject.c:22
void lexbor_dobject_clean(lexbor_dobject_t *dobject)
Definition dobject.c:64
lexbor_dobject_t * lexbor_dobject_destroy(lexbor_dobject_t *dobject, bool destroy_self)
Definition dobject.c:75
lexbor_dobject_t * lexbor_dobject_create(void)
Definition dobject.c:16
struct lxb_dom_node lxb_dom_node_t
Definition interface.h:38
lxb_status_t lexbor_hash_init(lexbor_hash_t *hash, size_t table_size, size_t struct_size)
Definition hash.c:120
lexbor_hash_t * lexbor_hash_create(void)
Definition hash.c:114
lexbor_hash_t * lexbor_hash_destroy(lexbor_hash_t *hash, bool destroy_obj)
Definition hash.c:168
lxb_html_tokenizer_t * lxb_html_tokenizer_ref(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:163
void lxb_html_tokenizer_callback_token_done_set_noi(lxb_html_tokenizer_t *tkz, lxb_html_tokenizer_token_f call_func, void *ctx)
Definition tokenizer.c:471
lxb_status_t lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:45
void lxb_html_tokenizer_status_set_noi(lxb_html_tokenizer_t *tkz, lxb_status_t status)
Definition tokenizer.c:464
lxb_status_t lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, size_t size)
Definition tokenizer.c:308
lxb_status_t lxb_html_tokenizer_attrs_make(lxb_html_tokenizer_t *tkz, size_t table_size)
Definition tokenizer.c:259
lxb_status_t lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:273
lxb_html_tokenizer_t * lxb_html_tokenizer_destroy(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:219
lxb_status_t lxb_html_tokenizer_tags_make(lxb_html_tokenizer_t *tkz, size_t table_size)
Definition tokenizer.c:246
lxb_status_t lxb_html_tokenizer_end(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:344
lxb_html_tokenizer_t * lxb_html_tokenizer_unref(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:179
lxb_status_t lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to, lxb_html_tokenizer_t *tkz_from)
Definition tokenizer.c:117
void lxb_html_tokenizer_state_set_noi(lxb_html_tokenizer_t *tkz, lxb_html_tokenizer_state_f state)
Definition tokenizer.c:485
lxb_html_tokenizer_t * lxb_html_tokenizer_create(void)
Definition tokenizer.c:39
void lxb_html_tokenizer_tree_set_noi(lxb_html_tokenizer_t *tkz, lxb_html_tree_t *tree)
Definition tokenizer.c:505
void lxb_html_tokenizer_attrs_destroy(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:267
lxb_html_tree_t * lxb_html_tokenizer_tree_noi(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:499
void lxb_html_tokenizer_set_state_by_tag(lxb_html_tokenizer_t *tkz, bool scripting, lxb_tag_id_t tag_id, lxb_ns_id_t ns)
Definition tokenizer.c:405
lexbor_hash_t * lxb_html_tokenizer_tags_noi(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:518
void lxb_html_tokenizer_clean(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:199
void * lxb_html_tokenizer_callback_token_done_ctx_noi(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:479
lxb_ns_id_t lxb_html_tokenizer_current_namespace(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:389
void lxb_html_tokenizer_tmp_tag_id_set_noi(lxb_html_tokenizer_t *tkz, lxb_tag_id_t tag_id)
Definition tokenizer.c:492
#define LXB_HTML_TKZ_TEMP_SIZE
Definition tokenizer.c:19
lexbor_mraw_t * lxb_html_tokenizer_mraw_noi(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:512
@ LXB_HTML_TOKENIZER_OPT_TAGS_SELF
Definition tokenizer.c:24
@ LXB_HTML_TOKENIZER_OPT_ATTRS_MRAW_SELF
Definition tokenizer.c:26
@ LXB_HTML_TOKENIZER_OPT_ATTRS_SELF
Definition tokenizer.c:25
@ LXB_HTML_TOKENIZER_OPT_UNDEF
Definition tokenizer.c:23
void lxb_html_tokenizer_tags_destroy(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:253
const lxb_char_t * lxb_html_tokenizer_eof
Definition tokenizer.c:30
new_type size
Definition ffi.c:4365
#define NULL
Definition gdcache.h:45
struct lexbor_hash lexbor_hash_t
Definition hash.h:41
struct lxb_html_tokenizer lxb_html_tokenizer_t
Definition base.h:26
struct lxb_html_tree lxb_html_tree_t
Definition base.h:28
lxb_html_token_t * lxb_html_token_create(lexbor_dobject_t *dobj)
Definition token.c:25
lxb_inline void lxb_html_token_clean(lxb_html_token_t *token)
Definition token.h:106
const lxb_char_t * lxb_html_tokenizer_state_plaintext_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
Definition state.c:321
const lxb_char_t * lxb_html_tokenizer_state_data_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
Definition state.c:204
const lxb_char_t *(* lxb_html_tokenizer_state_f)(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
Definition tokenizer.h:26
lxb_inline void lxb_html_tokenizer_state_set(lxb_html_tokenizer_t *tkz, lxb_html_tokenizer_state_f state)
Definition tokenizer.h:222
lxb_inline lexbor_hash_t * lxb_html_tokenizer_tags(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:176
lxb_html_token_t *(* lxb_html_tokenizer_token_f)(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token, void *ctx)
Definition tokenizer.h:30
lxb_inline void lxb_html_tokenizer_tree_set(lxb_html_tokenizer_t *tkz, lxb_html_tree_t *tree)
Definition tokenizer.h:242
lxb_inline void lxb_html_tokenizer_tmp_tag_id_set(lxb_html_tokenizer_t *tkz, lxb_tag_id_t tag_id)
Definition tokenizer.h:229
lxb_inline void * lxb_html_tokenizer_callback_token_done_ctx(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:216
lxb_inline lxb_html_tree_t * lxb_html_tokenizer_tree(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:236
lxb_inline void lxb_html_tokenizer_callback_token_done_set(lxb_html_tokenizer_t *tkz, lxb_html_tokenizer_token_f call_func, void *ctx)
Definition tokenizer.h:207
lxb_inline void lxb_html_tokenizer_status_set(lxb_html_tokenizer_t *tkz, lxb_status_t status)
Definition tokenizer.h:164
lxb_inline lexbor_mraw_t * lxb_html_tokenizer_mraw(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:248
LXB_API void * lexbor_free(void *dst)
Definition memory.c:33
LXB_API void * lexbor_malloc(size_t size)
Definition memory.c:15
LXB_API void * lexbor_calloc(size_t num, size_t size)
Definition memory.c:27
lexbor_mraw_t * lexbor_mraw_create(void)
Definition mraw.c:32
void lexbor_mraw_clean(lexbor_mraw_t *mraw)
Definition mraw.c:76
lxb_status_t lexbor_mraw_init(lexbor_mraw_t *mraw, size_t chunk_size)
Definition mraw.c:38
lexbor_mraw_t * lexbor_mraw_destroy(lexbor_mraw_t *mraw, bool destroy_self)
Definition mraw.c:87
uintptr_t lxb_ns_id_t
Definition const.h:20
@ LXB_NS__UNDEF
Definition const.h:24
@ LXB_NS_HTML
Definition const.h:26
unsigned const char * end
Definition php_ffi.h:51
zend_constant * data
const lxb_char_t * lxb_html_tokenizer_state_rawtext_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
const lxb_char_t * lxb_html_tokenizer_state_rcdata_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
const lxb_char_t * lxb_html_tokenizer_state_script_data_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
uintptr_t ns
Definition node.h:48
lxb_tag_id_t tag_id
Definition token.h:48
lxb_html_token_t * token
Definition tokenizer.h:49
lxb_char_t * pos
Definition tokenizer.h:72
lxb_html_tokenizer_state_f state_return
Definition tokenizer.h:36
lxb_html_tokenizer_t * base
Definition tokenizer.h:93
lexbor_hash_t * tags
Definition tokenizer.h:41
lxb_html_tokenizer_opt_t opt
Definition tokenizer.h:89
lxb_html_tokenizer_state_f state
Definition tokenizer.h:35
lexbor_hash_t * attrs
Definition tokenizer.h:42
void * callback_token_ctx
Definition tokenizer.h:39
lexbor_array_obj_t * parse_errors
Definition tokenizer.h:56
lexbor_mraw_t * mraw
Definition tokenizer.h:46
lxb_tag_id_t tmp_tag_id
Definition tokenizer.h:69
lexbor_mraw_t * attrs_mraw
Definition tokenizer.h:43
lxb_html_tokenizer_token_f callback_token_done
Definition tokenizer.h:38
lxb_status_t status
Definition tokenizer.h:90
lxb_char_t * start
Definition tokenizer.h:71
lxb_html_tree_t * tree
Definition tokenizer.h:64
size_t current_column
Definition tokenizer.h:77
const lxb_char_t * end
Definition tokenizer.h:73
lexbor_dobject_t * dobj_token_attr
Definition tokenizer.h:53
lexbor_dobject_t * dobj_token
Definition tokenizer.h:52
const lxb_char_t * last
Definition tokenizer.h:75
@ LXB_TAG_TITLE
Definition const.h:210
@ LXB_TAG_NOSCRIPT
Definition const.h:163
@ LXB_TAG_IFRAME
Definition const.h:127
@ LXB_TAG__END_OF_FILE
Definition const.h:25
@ LXB_TAG_PLAINTEXT
Definition const.h:173
@ LXB_TAG_SCRIPT
Definition const.h:185
@ LXB_TAG_NOEMBED
Definition const.h:161
@ LXB_TAG_TEXTAREA
Definition const.h:204
@ LXB_TAG_XMP
Definition const.h:219
@ LXB_TAG_STYLE
Definition const.h:195
@ LXB_TAG_NOFRAMES
Definition const.h:162
uintptr_t lxb_tag_id_t
Definition const.h:21
struct lxb_html_token_attr lxb_html_token_attr_t
Definition token_attr.h:22
lxb_inline lxb_dom_node_t * lxb_html_tree_adjusted_current_node(lxb_html_tree_t *tree)
Definition tree.h:297
unsigned int lxb_status_t
Definition types.h:28
unsigned char lxb_char_t
Definition types.h:27