php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
encoding.c
Go to the documentation of this file.
1/*
2 * Copyright (C) 2019 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
8
9#include "lexbor/core/str.h"
10
11
/*
 * Forward declarations of the two file-local (static) helpers that are
 * defined further down and used before their definitions.
 */
12static const lxb_char_t *
13lxb_html_encoding_meta(lxb_html_encoding_t *em,
14 const lxb_char_t *data, const lxb_char_t *end);
15
16static const lxb_char_t *
17lxb_html_get_attribute(const lxb_char_t *data, const lxb_char_t *end,
18 const lxb_char_t **name, const lxb_char_t **name_end,
19 const lxb_char_t **value, const lxb_char_t **value_end);
20
21
24{
 /*
  * Body of lxb_html_encoding_skip_spaces() (its signature line is not
  * part of this listing). Steps over TAB (0x09), LF (0x0A), FF (0x0C),
  * CR (0x0D) and SPACE (0x20) and returns a pointer to the first byte
  * that is none of these, or `end` when only such bytes remain.
  */
25 for (; data < end; data++) {
26 switch (*data) {
27 case 0x09: case 0x0A:
28 case 0x0C: case 0x0D:
29 case 0x20:
 /* Leaves the switch only; the for loop moves on to the next byte. */
30 break;
31
32 default:
33 return data;
34 }
35 }
36
37 return end;
38}
39
42{
 /*
  * Body of lxb_html_encoding_skip_name() (its signature line is not
  * part of this listing). Scans forward and returns a pointer to the
  * first TAB, LF, FF, CR, SPACE or '>' (0x3E) byte, or `end` when none
  * of them occurs before the end of the buffer.
  */
43 for (; data < end; data++) {
44 switch (*data) {
45 case 0x09: case 0x0A:
46 case 0x0C: case 0x0D:
47 case 0x20: case 0x3E:
48 return data;
49 }
50 }
51
52 return end;
53}
54
57{
 /*
  * Body of lxb_html_encoding_tag_end() (its signature line is not part
  * of this listing). Looks for the next '>' in [data, end) and returns
  * the position just after it, or `end` when there is no '>'.
  */
58 data = memchr(data, '>', (end - data));
59 if (data == NULL) {
60 return end;
61 }
62
 /* One past the '>' itself. */
63 return data + 1;
64}
65
68{
 /*
  * Body of lxb_html_encoding_init(). Several original lines (69, 72,
  * 75, 76 and 82) are absent from this listing, so only the visible
  * steps are described.
  */
70
 /* NOTE(review): the statement executed for a NULL `em` (line 72) is not
  * shown; presumably it returns an error status -- confirm. */
71 if (em == NULL) {
73 }
74
 /* NOTE(review): `status` is assigned on a line that is not shown,
  * presumably by initializing em->cache -- confirm. Any status other
  * than LXB_STATUS_OK is passed straight back to the caller. */
77 if (status != LXB_STATUS_OK) {
78 return status;
79 }
80
 /* The function's result is whatever initializing em->result with a
  * size of 12 returns; the element-size argument (line 82) is not shown. */
81 return lexbor_array_obj_init(&em->result, 12,
83}
84
87{
 /*
  * Body of lxb_html_encoding_destroy(). A NULL `em` is accepted and
  * yields NULL.
  */
88 if (em == NULL) {
89 return NULL;
90 }
91
 /* NOTE(review): original lines 92-93 are absent from this listing;
  * presumably they release em->cache and em->result -- confirm. */
94
 /* With self_destroy set the object itself is freed and the value
  * returned by lexbor_free() is handed back; otherwise the still
  * allocated object is returned to the caller. */
95 if (self_destroy) {
96 return lexbor_free(em);
97 }
98
99 return em;
100}
101
104 const lxb_char_t *data, const lxb_char_t *end)
105{
 /*
  * lxb_html_encoding_determine(): walks [data, end) from one '<' to the
  * next and looks at the byte that follows it. A tag named "meta" is
  * handed to lxb_html_encoding_meta(); comments, end tags and all other
  * tags are stepped over. Every return statement visible here yields
  * LXB_STATUS_OK.
  *
  * NOTE(review): original lines 103, 128, 133, 143, 159, 191 and 198 are
  * absent from this listing; remarks about them below are assumptions.
  */
106 const lxb_char_t *name, *name_end;
107 const lxb_char_t *value, *value_end;
108
109 while (data < end) {
110 /* Find tag beginning */
111 data = memchr(data, '<', (end - data));
112 if (data == NULL) {
113 return LXB_STATUS_OK;
114 }
115
 /* Step onto the byte after '<'; stop if '<' was the last byte. */
116 if (++data == end) {
117 return LXB_STATUS_OK;
118 }
119
120 switch (*data) {
121 /* Comment or broken tag */
122 case '!':
 /* Give up when fewer than 5 bytes remain, counting from '!'. */
123 if ((data + 5) > end) {
124 return LXB_STATUS_OK;
125 }
126
 /* Not "<!--": line 128 (not shown) runs before the continue;
  * presumably it moves data past the closing '>' -- confirm. */
127 if (data[1] != '-' || data[2] != '-') {
129 continue;
130 }
131
 /* NOTE(review): line 133 (not shown) presumably advances data to
  * just past the next '>'. The test below then checks whether the
  * two bytes in front of that '>' are "--", which ends the comment. */
132 while (data < end) {
134
135 if (data[-3] == '-' && data[-2] == '-') {
136 break;
137 }
138 }
139
140 break;
141
 /* "<?": the handling statement (line 143) is not shown. */
142 case '?':
144 break;
145
 /* End tag. */
146 case '/':
147 data++;
148
 /* Give up when fewer than 3 bytes remain after the '/'. */
149 if ((data + 3) > end) {
150 return LXB_STATUS_OK;
151 }
152
 /* Unsigned range checks: true for 'A'..'Z' (0x41..0x5A) or
  * 'a'..'z' (0x61..0x7A). */
153 if ((unsigned) (*data - 0x41) <= (0x5A - 0x41)
154 || (unsigned) (*data - 0x61) <= (0x7A - 0x61))
155 {
156 goto skip_attributes;
157 }
158
 /* Not a letter after "</": line 159 (not shown) runs here. */
160 break;
161
162 default:
163
 /* A byte that is not an ASCII letter cannot start a tag name:
  * leave the switch and search for the next '<'. */
164 if ((unsigned) (*data - 0x41) > (0x5A - 0x41)
165 && (unsigned) (*data - 0x61) > (0x7A - 0x61))
166 {
167 break;
168 }
169
 /* Give up when fewer than 6 bytes remain from the tag name start. */
170 if ((data + 6) > end) {
171 return LXB_STATUS_OK;
172 }
173
 /* Any tag whose first four bytes are not "meta" (compared without
  * regard to case) only has its attributes skipped. */
174 if (!lexbor_str_data_ncasecmp(data, (lxb_char_t *) "meta", 4)) {
175 goto skip_attributes;
176 }
177
178 data += 4;
179
 /* The byte right after "meta" must be TAB, LF, FF, CR, SPACE or '/'
  * (0x2F); otherwise this is a longer tag name. data is advanced past
  * that byte in both cases. */
180 switch (*data++) {
181 case 0x09: case 0x0A: case 0x0C:
182 case 0x0D: case 0x20: case 0x2F:
183 break;
184
185 default:
186 goto skip_attributes;
187 }
188
 /* lxb_html_encoding_meta() returns NULL when it fails to add an
  * entry; the statement handling that (line 191) is not shown,
  * presumably an error return -- confirm. */
189 data = lxb_html_encoding_meta(em, data, end);
190 if (data == NULL) {
192 }
193
194 break;
195
196 skip_attributes:
197
 /* NOTE(review): line 198 (not shown) runs first; presumably it moves
  * data to the end of the tag name -- confirm. */
199 if (data >= end) {
200 return LXB_STATUS_OK;
201 }
202
 /* Tag without attributes: resume the search after the '>'. */
203 if (*data == '>') {
204 data++;
205 continue;
206 }
207
208 /* Skip attributes */
 /* The parsed name and value are discarded; the loop ends once
  * lxb_html_get_attribute() reports no further attribute
  * (name == NULL) or the input is exhausted. */
209 while (data < end) {
210 data = lxb_html_get_attribute(data, end, &name, &name_end,
211 &value, &value_end);
212 if (name == NULL) {
213 break;
214 }
215 }
216
217 break;
218 }
219 }
220
221 return LXB_STATUS_OK;
222}
223
/*
 * Processes the attributes of one "meta" tag, starting at `data`.
 *
 * Attributes are read one at a time with lxb_html_get_attribute(). The
 * names "http-equiv", "content" and "charset" are recognised without
 * regard to case, and an encoding label taken from "charset" or from the
 * value of "content" is stored as an entry (name/end pointers into the
 * input buffer).
 *
 * Returns the position reached in the input, or NULL when obtaining a new
 * entry fails.
 *
 * NOTE(review): original lines 233, 238, 240, 260, 263, 270, 306, 322 and
 * 336 are absent from this listing (among them the declaration and every
 * assignment of `attr`, and the assignment of `cur`); remarks that depend
 * on them are marked as assumptions.
 */
224static const lxb_char_t *
225lxb_html_encoding_meta(lxb_html_encoding_t *em,
226 const lxb_char_t *data, const lxb_char_t *end)
227{
228 size_t i, len, cur;
229 bool got_pragma, have_content;
230 uint8_t need_pragma;
231 const lxb_char_t *name, *name_end;
232 const lxb_char_t *value, *value_end;
234
 /* got_pragma:   an http-equiv attribute with value "content-type" was seen.
  * have_content: a charset was already taken from a content attribute.
  * need_pragma:  0x00 nothing found, 0x01 found via charset,
  *               0x02 found via content. */
235 got_pragma = false;
236 have_content = false;
237 need_pragma = 0x00;
239
241
242 while (data < end) {
243
 /* Target of the goto in the "exists" loop below; jumping here re-enters
  * the loop body without re-evaluating the while condition. */
244 find_attr:
245
246 data = lxb_html_get_attribute(data, end, &name, &name_end,
247 &value, &value_end);
248 if (name == NULL) {
249 break;
250 }
251
252 len = name_end - name;
253
 /* All three names of interest are at least 7 bytes long. */
254 if (len < 7) {
255 continue;
256 }
257
258 /* Exists check */
 /* NOTE(review): the lookup of `attr` (line 260) and the second half of
  * the condition (line 263) are not shown; presumably a name already
  * present in em->cache makes this attribute a duplicate that is
  * skipped -- confirm. */
259 for (i = 0; i < lexbor_array_obj_length(&em->cache); i++) {
261
262 if ((size_t) (attr->end - attr->name) == len
264 {
265 goto find_attr;
266 }
267 }
268
269 /* Append attribute to cache */
 /* NOTE(review): `attr` is obtained on line 270 (not shown), presumably
  * by pushing a new entry onto em->cache -- confirm. */
271 if (attr == NULL) {
272 return NULL;
273 }
274
275 attr->name = name;
276 attr->end = name_end;
277
 /* Attributes without a value carry no encoding information. */
278 if (value == NULL) {
279 continue;
280 }
281
282 /* http-equiv check */
 /* Any name of exactly this length is consumed by this branch, whether
  * or not it turns out to be "http-equiv". */
283 if (len == (sizeof("http-equiv") - 1)) {
284 if (!lexbor_str_data_ncasecmp((lxb_char_t *) "http-equiv", name, len)) {
285 continue;
286 }
287
288 if ((value_end - value) == (sizeof("content-type") - 1)
289 && lexbor_str_data_ncasecmp((lxb_char_t *) "content-type",
290 value, (sizeof("content-type") - 1)))
291 {
292 got_pragma = true;
293 }
294
295 continue;
296 }
297
 /* NOTE(review): only the first 7 bytes are compared and len is only
  * known to be >= 7 here, so a longer name that merely begins with
  * "content" also matches -- confirm this is intended. */
298 if (lexbor_str_data_ncasecmp((lxb_char_t *) "content", name, 7)) {
 /* Only the first content attribute that yields a charset is used. */
299 if (have_content == false) {
300
 /* name/name_end are reused for the label found inside the value. */
301 name = lxb_html_encoding_content(value, value_end, &name_end);
302 if (name == NULL) {
303 continue;
304 }
305
 /* NOTE(review): `attr` is obtained on line 306 (not shown),
  * presumably a new entry of em->result -- confirm. */
307 if (attr == NULL) {
308 return NULL;
309 }
310
311 attr->name = name;
312 attr->end = name_end;
313
314 need_pragma = 0x02;
315 have_content = true;
316 }
317
318 continue;
319 }
320
 /* Same 7-byte prefix comparison as for "content" above. */
321 if (lexbor_str_data_ncasecmp((lxb_char_t *) "charset", name, 7)) {
 /* NOTE(review): `attr` is obtained on line 322 (not shown),
  * presumably a new entry of em->result -- confirm. */
323 if (attr == NULL) {
324 return NULL;
325 }
326
 /* The whole attribute value is the encoding label. */
327 attr->name = value;
328 attr->end = value_end;
329
330 need_pragma = 0x01;
331 }
332 }
333
 /* Nothing usable was found, or the label came from a content attribute
  * without an accompanying http-equiv="content-type".
  * NOTE(review): `cur` is assigned on a line that is not shown, presumably
  * the length of em->result on entry, and line 336 presumably removes the
  * entry added by this call -- confirm. */
334 if (need_pragma == 0x00 || (need_pragma == 0x02 && got_pragma == false)) {
335 if (cur != lexbor_array_obj_length(&em->result)) {
337 }
338 }
339
340 return data;
341}
342
/*
 * lxb_html_encoding_content(): finds "charset", followed by '=', inside
 * [data, end) and returns a pointer to the start of the value that follows.
 * *name_end is set to the end of that value. The value is either enclosed
 * in single or double quotes, or it runs up to the next ';' or whitespace
 * byte. Returns NULL when no usable value is found.
 *
 * NOTE(review): original lines 344, 360 and 369 are absent from this
 * listing; remarks that depend on them are marked as assumptions.
 */
343const lxb_char_t *
345 const lxb_char_t **name_end)
346{
347 const lxb_char_t *name;
348
349 do {
 /* Case-insensitive search for "charset". The bound (data + 7) < end
  * means at least one more byte must follow the 7-byte word. */
350 for (; (data + 7) < end; data++) {
351 if (lexbor_str_data_ncasecmp((lxb_char_t *) "charset", data, 7)) {
352 goto found;
353 }
354 }
355
356 return NULL;
357
358 found:
359
 /* NOTE(review): line 360 (not shown) presumably moves data past
  * "charset" and any following whitespace -- confirm. */
361 if (data >= end) {
362 return NULL;
363 }
364
 /* No '=' here: `continue` goes to the while (true) test and the search
  * restarts from the current position. */
365 if (*data != '=') {
366 continue;
367 }
368
 /* NOTE(review): line 369 (not shown) presumably moves data past '='
  * and any following whitespace -- confirm. */
370 if (data >= end) {
371 return NULL;
372 }
373
374 break;
375 }
376 while (true);
377
 /* Quoted value. */
378 if (*data == '\'' || *data == '"') {
 /* *name_end temporarily points at the opening quote, so that
  * **name_end below is the quote character to look for. */
379 *name_end = data++;
380 name = data;
381
382 for (; data < end; data++) {
383 if (*data == **name_end) {
384 break;
385 }
386 }
387
 /* Without a closing quote the loop stops at `end`, so the value then
  * extends to the end of the input. */
388 *name_end = data;
389 goto done;
390 }
391
 /* Unquoted value. */
392 name = data;
393 *name_end = data;
394
395 for (; data < end; data++) {
396 switch (*data) {
 /* ';' or whitespace ends the value. These jumps bypass the empty-value
  * check below, so an empty range can be returned from here. */
397 case ';':
398 goto done;
399
400 case 0x09: case 0x0A:
401 case 0x0C: case 0x0D:
402 case 0x20:
403 goto done;
404
 /* A quote inside an unquoted value makes the value invalid. */
405 case '"':
406 case '\'':
407 return NULL;
408 }
409 }
410
 /* Reached only when the loop ran to `end`: reject an empty value. */
411 if (data == name) {
412 return NULL;
413 }
414
415done:
416
417 *name_end = data;
418
419 return name;
420}
421
/*
 * Parses at most one attribute starting at `data`.
 *
 * On return:
 *   - *name is NULL when no attribute was found (the tag ended with '>'
 *     or the input was exhausted); otherwise [*name, *name_end) is the
 *     attribute name.
 *   - *value is NULL when the attribute has no usable value; otherwise
 *     [*value, *value_end) is the value without surrounding quotes.
 *
 * The return value is the position from which parsing continues.
 *
 * NOTE(review): original lines 427, 484 and 497 are absent from this
 * listing; line 427 presumably declares `ch`, and the other two presumably
 * skip whitespace -- confirm.
 */
422static const lxb_char_t *
423lxb_html_get_attribute(const lxb_char_t *data, const lxb_char_t *end,
424 const lxb_char_t **name, const lxb_char_t **name_end,
425 const lxb_char_t **value, const lxb_char_t **value_end)
426{
428
429 *name = NULL;
430 *value = NULL;
431
 /* Skip whitespace and '/' (0x2F) in front of the name; a '>' (0x3E)
  * ends the tag and is consumed. */
432 for (; data < end; data++) {
433 switch (*data) {
434 case 0x09: case 0x0A:
435 case 0x0C: case 0x0D:
436 case 0x20: case 0x2F:
437 break;
438
439 case 0x3E:
440 return (data + 1);
441
442 default:
443 goto name_state;
444 }
445 }
446
 /* Input exhausted before any name byte was seen. */
447 if (data == end) {
448 return data;
449 }
450
451name_state:
452
453 /* Attribute name */
454 *name = data;
455
456 while (data < end) {
457 switch (*data) {
 /* Whitespace ends the name; an '=' may still follow it. */
458 case 0x09: case 0x0A:
459 case 0x0C: case 0x0D:
460 case 0x20:
461 *name_end = data;
462
463 data++;
464 goto spaces_state;
465
 /* '/' or '>' ends the name; the byte itself is not consumed. */
466 case '/': case '>':
467 *name_end = data;
468 return data;
469
 /* NOTE(review): *name was assigned from data just above, so this
  * test looks always-true -- confirm. */
470 case '=':
471 if (*name != NULL) {
472 *name_end = data++;
473 goto value_state;
474 }
475 }
476
477 data++;
478 }
479
 /* Input ended inside the name. */
480 *name_end = data;
481
482spaces_state:
483
485 if (data == end) {
486 return data;
487 }
488
 /* No '=' after the name: the attribute has no value. */
489 if (*data != '=') {
490 return data;
491 }
492
493 data += 1;
494
495value_state:
496
498 if (data == end) {
499 return data;
500 }
501
502 switch (*data) {
 /* Quoted value: everything up to the matching quote character. */
503 case '"':
504 case '\'':
505 ch = *data++;
506 if (data == end) {
507 return data;
508 }
509
510 *value = data;
511
512 do {
513 if (*data == ch) {
514 *value_end = data;
515 return data + 1;
516 }
517 }
518 while (++data < end);
519
 /* No closing quote before the end of input: report no value. */
520 *value = NULL;
521
522 return data;
523
 /* '>' right after '=': no value; data is left on the '>'. */
524 case '>':
525 return data;
526
 /* Unquoted value: the first byte is always part of the value. */
527 default:
528 *value = data++;
529 break;
530 }
531
 /* An unquoted value ends at whitespace or '>'. */
532 for (; data < end; data++) {
533 switch (*data) {
534 case 0x09: case 0x0A:
535 case 0x0C: case 0x0D:
536 case 0x20: case 0x3E:
537 *value_end = data;
538 return data;
539 }
540 }
541
 /* Input ended before the value was terminated: report no value. */
542 *value = NULL;
543
544 return data;
545}
546
547/*
548 * Non-inline (_noi) versions of the inline functions, kept for ABI.
549 */
555
556void
561
567
568size_t
573
size_t len
Definition apprentice.c:174
void * lexbor_array_obj_pop(lexbor_array_obj_t *array)
Definition array_obj.c:147
void * lexbor_array_obj_push(lexbor_array_obj_t *array)
Definition array_obj.c:93
void lexbor_array_obj_clean(lexbor_array_obj_t *array)
Definition array_obj.c:42
lxb_status_t lexbor_array_obj_init(lexbor_array_obj_t *array, size_t size, size_t struct_size)
Definition array_obj.c:17
lexbor_array_obj_t * lexbor_array_obj_destroy(lexbor_array_obj_t *array, bool self_destroy)
Definition array_obj.c:50
lxb_inline size_t lexbor_array_obj_length(lexbor_array_obj_t *array)
Definition array_obj.h:80
lxb_inline void * lexbor_array_obj_get(const lexbor_array_obj_t *array, size_t idx)
Definition array_obj.h:70
@ LXB_STATUS_ERROR_MEMORY_ALLOCATION
Definition base.h:51
@ LXB_STATUS_ERROR_WRONG_ARGS
Definition base.h:58
@ LXB_STATUS_OK
Definition base.h:49
DNS_STATUS status
Definition dns_win32.c:49
lxb_inline const lxb_char_t * lxb_html_encoding_tag_end(const lxb_char_t *data, const lxb_char_t *end)
Definition encoding.c:56
lxb_html_encoding_entry_t * lxb_html_encoding_meta_entry_noi(lxb_html_encoding_t *em, size_t idx)
Definition encoding.c:563
lxb_inline const lxb_char_t * lxb_html_encoding_skip_spaces(const lxb_char_t *data, const lxb_char_t *end)
Definition encoding.c:23
lxb_html_encoding_t * lxb_html_encoding_create_noi(void)
Definition encoding.c:551
lxb_status_t lxb_html_encoding_init(lxb_html_encoding_t *em)
Definition encoding.c:67
lexbor_array_obj_t * lxb_html_encoding_meta_result_noi(lxb_html_encoding_t *em)
Definition encoding.c:575
const lxb_char_t * lxb_html_encoding_content(const lxb_char_t *data, const lxb_char_t *end, const lxb_char_t **name_end)
Definition encoding.c:344
lxb_html_encoding_t * lxb_html_encoding_destroy(lxb_html_encoding_t *em, bool self_destroy)
Definition encoding.c:86
lxb_inline const lxb_char_t * lxb_html_encoding_skip_name(const lxb_char_t *data, const lxb_char_t *end)
Definition encoding.c:41
lxb_status_t lxb_html_encoding_determine(lxb_html_encoding_t *em, const lxb_char_t *data, const lxb_char_t *end)
Definition encoding.c:103
size_t lxb_html_encoding_meta_length_noi(lxb_html_encoding_t *em)
Definition encoding.c:569
void lxb_html_encoding_clean_noi(lxb_html_encoding_t *em)
Definition encoding.c:557
zend_long ch
Definition ffi.c:4580
new_type attr
Definition ffi.c:4364
#define NULL
Definition gdcache.h:45
lxb_inline size_t lxb_html_encoding_meta_length(lxb_html_encoding_t *em)
Definition encoding.h:72
lxb_inline void lxb_html_encoding_clean(lxb_html_encoding_t *em)
Definition encoding.h:59
lxb_inline lxb_html_encoding_t * lxb_html_encoding_create(void)
Definition encoding.h:52
lxb_inline lexbor_array_obj_t * lxb_html_encoding_meta_result(lxb_html_encoding_t *em)
Definition encoding.h:78
lxb_inline lxb_html_encoding_entry_t * lxb_html_encoding_meta_entry(lxb_html_encoding_t *em, size_t idx)
Definition encoding.h:66
LXB_API void * lexbor_free(void *dst)
Definition memory.c:33
unsigned const char * end
Definition php_ffi.h:51
zend_constant * data
bool lexbor_str_data_ncasecmp(const lxb_char_t *first, const lxb_char_t *sec, size_t size)
Definition str.c:435
Definition encoding.h:19
lexbor_array_obj_t result
Definition encoding.h:27
lexbor_array_obj_t cache
Definition encoding.h:26
unsigned int lxb_status_t
Definition types.h:28
#define lxb_inline
Definition types.h:21
unsigned char lxb_char_t
Definition types.h:27
zend_string * name
value