php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
tokenizer.h
Go to the documentation of this file.
1/*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
7#ifndef LEXBOR_HTML_TOKENIZER_H
8#define LEXBOR_HTML_TOKENIZER_H
9
10#ifdef __cplusplus
11extern "C" {
12#endif
13
14#include "lexbor/core/sbst.h"
16
17#include "lexbor/html/base.h"
18#include "lexbor/html/token.h"
19
20#include "lexbor/tag/tag.h"
21#include "lexbor/ns/ns.h"
22
23
24/* State */
25typedef const lxb_char_t *
26(*lxb_html_tokenizer_state_f)(lxb_html_tokenizer_t *tkz,
27 const lxb_char_t *data, const lxb_char_t *end);
28
30(*lxb_html_tokenizer_token_f)(lxb_html_tokenizer_t *tkz,
31 lxb_html_token_t *token, void *ctx);
32
33
37
40
44
45 /* For temporary strings and other temporary data */
47
48 /* Token currently being processed */
50
51 /* Memory for tokens and attributes */
54
55 /* Parse error */
57
58 /*
59 * Leaky abstraction.
60 * This is the only place where the specification forces the Tree Builder
61 * and the Tokenizer to be mixed, at the cost of a clean design.
62 * Current tree parser. This pointer is not a counted reference (no ref count).
63 */
65
66 /* Temp */
70
78
79 /* Entities */
82 uintptr_t entity_start;
83 uintptr_t entity_end;
84 uint32_t entity_length;
85 uint32_t entity_number;
87
88 /* Process */
91 bool is_eof;
92
94 size_t ref_count;
95};
96
97
99
100
102
105
108
111 lxb_html_tokenizer_t *tkz_from);
112
115
118
119LXB_API void
121
124
127
128LXB_API void
130
133
134LXB_API void
136
139
142 const lxb_char_t *data, size_t size);
143
146
147
148LXB_API const lxb_char_t *
150 const lxb_char_t *pos);
151
154
155LXB_API void
157 lxb_tag_id_t tag_id, lxb_ns_id_t ns);
158
159
160/*
161 * Inline functions
162 */
163lxb_inline void
168
169lxb_inline void
174
177{
178 return tkz->tags;
179}
180
181lxb_inline void
186
189{
190 return tkz->attrs;
191}
192
193lxb_inline void
199
205
206lxb_inline void
209 void *ctx)
210{
211 tkz->callback_token_done = call_func;
212 tkz->callback_token_ctx = ctx;
213}
214
215lxb_inline void *
220
221lxb_inline void
227
228lxb_inline void
234
237{
238 return tkz->tree;
239}
240
241lxb_inline void
246
249{
250 return tkz->mraw;
251}
252
255{
256 size_t length = tkz->pos - tkz->start;
257 size_t new_size = (tkz->end - tkz->start) + size + 4096;
258
259 tkz->start = (lxb_char_t *)lexbor_realloc(tkz->start, new_size);
260 if (tkz->start == NULL) {
262 return tkz->status;
263 }
264
265 tkz->pos = tkz->start + length;
266 tkz->end = tkz->start + new_size;
267
268 return LXB_STATUS_OK;
269}
270
273 const lxb_char_t *data)
274{
275 size_t size = data - tkz->begin;
276
277 if ((tkz->pos + size) > tkz->end) {
279 return tkz->status;
280 }
281 }
282
283 tkz->pos = (lxb_char_t *) memcpy(tkz->pos, tkz->begin, size) + size;
284
285 return LXB_STATUS_OK;
286}
287
290 const lxb_char_t *data, size_t size)
291{
292 if ((tkz->pos + size) > tkz->end) {
294 return tkz->status;
295 }
296 }
297
298 tkz->pos = (lxb_char_t *) memcpy(tkz->pos, data, size) + size;
299
300 return LXB_STATUS_OK;
301}
302
303
304/*
305 * Non-inline functions for ABI.
306 */
307LXB_API void
310
311LXB_API void
314 void *ctx);
315
316LXB_API void *
318
319LXB_API void
322
323LXB_API void
325 lxb_tag_id_t tag_id);
326
329
330LXB_API void
332 lxb_html_tree_t *tree);
333
336
339
340
341#ifdef __cplusplus
342} /* extern "C" */
343#endif
344
345#endif /* LEXBOR_HTML_TOKENIZER_H */
@ LXB_STATUS_ERROR_MEMORY_ALLOCATION
Definition base.h:51
@ LXB_STATUS_OK
Definition base.h:49
#define LXB_API
Definition def.h:48
DNS_STATUS status
Definition dns_win32.c:49
const lxb_char_t * lxb_html_tokenizer_eof
Definition tokenizer.c:30
ffi tags
Definition ffi.c:3115
new_type size
Definition ffi.c:4365
memcpy(ptr1, ptr2, size)
#define NULL
Definition gdcache.h:45
struct lexbor_hash lexbor_hash_t
Definition hash.h:41
unsigned int lxb_html_tokenizer_opt_t
Definition base.h:27
struct lxb_html_tokenizer lxb_html_tokenizer_t
Definition base.h:26
struct lxb_html_tree lxb_html_tree_t
Definition base.h:28
LXB_API void lxb_html_tokenizer_callback_token_done_set_noi(lxb_html_tokenizer_t *tkz, lxb_html_tokenizer_token_f call_func, void *ctx)
Definition tokenizer.c:471
LXB_API lxb_html_tokenizer_t * lxb_html_tokenizer_create(void)
Definition tokenizer.c:39
LXB_API lxb_html_tokenizer_t * lxb_html_tokenizer_unref(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:179
LXB_API void * lxb_html_tokenizer_callback_token_done_ctx_noi(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:479
const lxb_char_t *(* lxb_html_tokenizer_state_f)(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
Definition tokenizer.h:26
lxb_inline lxb_status_t lxb_html_tokenizer_temp_append_data(lxb_html_tokenizer_t *tkz, const lxb_char_t *data)
Definition tokenizer.h:272
LXB_API lexbor_mraw_t * lxb_html_tokenizer_mraw_noi(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:512
LXB_API lxb_status_t lxb_html_tokenizer_attrs_make(lxb_html_tokenizer_t *tkz, size_t table_size)
Definition tokenizer.c:259
LXB_API lxb_status_t lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to, lxb_html_tokenizer_t *tkz_from)
Definition tokenizer.c:117
lxb_inline lexbor_mraw_t * lxb_html_tokenizer_attrs_mraw(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:201
lxb_inline void lxb_html_tokenizer_attrs_set(lxb_html_tokenizer_t *tkz, lexbor_hash_t *attrs)
Definition tokenizer.h:182
lxb_inline void lxb_html_tokenizer_state_set(lxb_html_tokenizer_t *tkz, lxb_html_tokenizer_state_f state)
Definition tokenizer.h:222
lxb_inline lexbor_hash_t * lxb_html_tokenizer_tags(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:176
lxb_html_token_t *(* lxb_html_tokenizer_token_f)(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token, void *ctx)
Definition tokenizer.h:30
lxb_inline void lxb_html_tokenizer_tree_set(lxb_html_tokenizer_t *tkz, lxb_html_tree_t *tree)
Definition tokenizer.h:242
LXB_API lxb_html_tokenizer_t * lxb_html_tokenizer_ref(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:163
LXB_API lexbor_hash_t * lxb_html_tokenizer_tags_noi(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:518
lxb_inline void lxb_html_tokenizer_tmp_tag_id_set(lxb_html_tokenizer_t *tkz, lxb_tag_id_t tag_id)
Definition tokenizer.h:229
LXB_API lxb_html_tree_t * lxb_html_tokenizer_tree_noi(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:499
LXB_API lxb_status_t lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:273
lxb_inline lexbor_hash_t * lxb_html_tokenizer_attrs(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:188
LXB_API void lxb_html_tokenizer_tags_destroy(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:253
LXB_API void lxb_html_tokenizer_clean(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:199
LXB_API lxb_status_t lxb_html_tokenizer_end(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:344
lxb_inline void * lxb_html_tokenizer_callback_token_done_ctx(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:216
LXB_API void lxb_html_tokenizer_status_set_noi(lxb_html_tokenizer_t *tkz, lxb_status_t status)
Definition tokenizer.c:464
lxb_inline lxb_status_t lxb_html_tokenizer_temp_realloc(lxb_html_tokenizer_t *tkz, size_t size)
Definition tokenizer.h:254
LXB_API void lxb_html_tokenizer_tree_set_noi(lxb_html_tokenizer_t *tkz, lxb_html_tree_t *tree)
Definition tokenizer.c:505
LXB_API lxb_status_t lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, size_t size)
Definition tokenizer.c:308
lxb_inline lxb_status_t lxb_html_tokenizer_temp_append(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, size_t size)
Definition tokenizer.h:289
LXB_API lxb_html_tokenizer_t * lxb_html_tokenizer_destroy(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:219
LXB_API lxb_status_t lxb_html_tokenizer_tags_make(lxb_html_tokenizer_t *tkz, size_t table_size)
Definition tokenizer.c:246
LXB_API void lxb_html_tokenizer_attrs_destroy(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:267
LXB_API lxb_ns_id_t lxb_html_tokenizer_current_namespace(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:389
LXB_API void lxb_html_tokenizer_state_set_noi(lxb_html_tokenizer_t *tkz, lxb_html_tokenizer_state_f state)
Definition tokenizer.c:485
lxb_inline lxb_html_tree_t * lxb_html_tokenizer_tree(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:236
LXB_API lxb_status_t lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
Definition tokenizer.c:45
LXB_API void lxb_html_tokenizer_set_state_by_tag(lxb_html_tokenizer_t *tkz, bool scripting, lxb_tag_id_t tag_id, lxb_ns_id_t ns)
Definition tokenizer.c:405
lxb_inline void lxb_html_tokenizer_attrs_mraw_set(lxb_html_tokenizer_t *tkz, lexbor_mraw_t *mraw)
Definition tokenizer.h:194
LXB_API void lxb_html_tokenizer_tmp_tag_id_set_noi(lxb_html_tokenizer_t *tkz, lxb_tag_id_t tag_id)
Definition tokenizer.c:492
lxb_inline void lxb_html_tokenizer_callback_token_done_set(lxb_html_tokenizer_t *tkz, lxb_html_tokenizer_token_f call_func, void *ctx)
Definition tokenizer.h:207
lxb_inline void lxb_html_tokenizer_tags_set(lxb_html_tokenizer_t *tkz, lexbor_hash_t *tags)
Definition tokenizer.h:170
LXB_API const lxb_char_t * lxb_html_tokenizer_change_incoming(lxb_html_tokenizer_t *tkz, const lxb_char_t *pos)
lxb_inline void lxb_html_tokenizer_status_set(lxb_html_tokenizer_t *tkz, lxb_status_t status)
Definition tokenizer.h:164
lxb_inline lexbor_mraw_t * lxb_html_tokenizer_mraw(lxb_html_tokenizer_t *tkz)
Definition tokenizer.h:248
LXB_API void * lexbor_realloc(void *dst, size_t size)
Definition memory.c:21
uintptr_t lxb_ns_id_t
Definition const.h:20
unsigned const char * end
Definition php_ffi.h:51
unsigned const char * pos
Definition php_ffi.h:52
zend_constant * data
Definition sbst.h:19
lxb_html_token_t * token
Definition tokenizer.h:49
const lxb_char_t * markup
Definition tokenizer.h:67
uintptr_t entity_start
Definition tokenizer.h:82
lxb_char_t * pos
Definition tokenizer.h:72
lxb_html_tokenizer_state_f state_return
Definition tokenizer.h:36
const lexbor_sbst_entry_static_t * entity
Definition tokenizer.h:80
lxb_html_tokenizer_t * base
Definition tokenizer.h:93
const lxb_char_t * temp
Definition tokenizer.h:68
lexbor_hash_t * tags
Definition tokenizer.h:41
lxb_html_tokenizer_opt_t opt
Definition tokenizer.h:89
lxb_html_tokenizer_state_f state
Definition tokenizer.h:35
uint32_t entity_number
Definition tokenizer.h:85
const lxb_char_t * begin
Definition tokenizer.h:74
lexbor_hash_t * attrs
Definition tokenizer.h:42
void * callback_token_ctx
Definition tokenizer.h:39
lexbor_array_obj_t * parse_errors
Definition tokenizer.h:56
lexbor_mraw_t * mraw
Definition tokenizer.h:46
lxb_tag_id_t tmp_tag_id
Definition tokenizer.h:69
uint32_t entity_length
Definition tokenizer.h:84
lexbor_mraw_t * attrs_mraw
Definition tokenizer.h:43
lxb_html_tokenizer_token_f callback_token_done
Definition tokenizer.h:38
uintptr_t entity_end
Definition tokenizer.h:83
lxb_status_t status
Definition tokenizer.h:90
lxb_char_t * start
Definition tokenizer.h:71
lxb_html_tree_t * tree
Definition tokenizer.h:64
size_t current_column
Definition tokenizer.h:77
const lxb_char_t * end
Definition tokenizer.h:73
const lexbor_sbst_entry_static_t * entity_match
Definition tokenizer.h:81
lexbor_dobject_t * dobj_token_attr
Definition tokenizer.h:53
lexbor_dobject_t * dobj_token
Definition tokenizer.h:52
const lxb_char_t * last
Definition tokenizer.h:75
uintptr_t lxb_tag_id_t
Definition const.h:21
unsigned int lxb_status_t
Definition types.h:28
#define lxb_inline
Definition types.h:21
unsigned char lxb_char_t
Definition types.h:27