php-internal-docs 8.4.8
Unofficial docs for php/php-src
state.c
1/*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
10
11#define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12#define LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER
13#define LEXBOR_STR_RES_REPLACEMENT_CHARACTER
14#define LEXBOR_STR_RES_ALPHA_CHARACTER
15#define LEXBOR_STR_RES_MAP_HEX
16#define LEXBOR_STR_RES_MAP_NUM
17#include "lexbor/core/str_res.h"
18#include "lexbor/core/swar.h"
19
20#define LXB_HTML_TOKENIZER_RES_ENTITIES_SBST
22
23
24const lxb_tag_data_t *
26 const lxb_char_t *name, size_t length);
27
30 const lxb_char_t *name, size_t length);
31
32
33static const lxb_char_t *
34lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
35 const lxb_char_t *data, const lxb_char_t *end);
36
37static const lxb_char_t *
38lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
39 const lxb_char_t *data,
40 const lxb_char_t *end);
41
42/* Tag */
43static const lxb_char_t *
44lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
45 const lxb_char_t *data,
46 const lxb_char_t *end);
47
48static const lxb_char_t *
49lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
50 const lxb_char_t *data,
51 const lxb_char_t *end);
52
53static const lxb_char_t *
54lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
55 const lxb_char_t *data,
56 const lxb_char_t *end);
57
58/* Attribute */
59static const lxb_char_t *
60lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
61 const lxb_char_t *data,
62 const lxb_char_t *end);
63
64static const lxb_char_t *
65lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
66 const lxb_char_t *data,
67 const lxb_char_t *end);
68
69static const lxb_char_t *
70lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
71 const lxb_char_t *data,
72 const lxb_char_t *end);
73
74static const lxb_char_t *
75lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
76 const lxb_char_t *data,
77 const lxb_char_t *end);
78
79static const lxb_char_t *
80lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
81 const lxb_char_t *data,
82 const lxb_char_t *end);
83
84static const lxb_char_t *
85lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
86 const lxb_char_t *data,
87 const lxb_char_t *end);
88
89static const lxb_char_t *
90lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
91 const lxb_char_t *data,
92 const lxb_char_t *end);
93
94static const lxb_char_t *
95lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
96 const lxb_char_t *data,
97 const lxb_char_t *end);
98
99static const lxb_char_t *
100lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
101 const lxb_char_t *data,
102 const lxb_char_t *end);
103
104/* Markup declaration */
105static const lxb_char_t *
106lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
107 const lxb_char_t *data,
108 const lxb_char_t *end);
109
110static const lxb_char_t *
111lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
112 const lxb_char_t *data,
113 const lxb_char_t *end);
114
115static const lxb_char_t *
116lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
117 const lxb_char_t *data,
118 const lxb_char_t *end);
119
120static const lxb_char_t *
121lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
122 const lxb_char_t *data,
123 const lxb_char_t *end);
124
125/* CDATA Section */
126static const lxb_char_t *
127lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
128 const lxb_char_t *data,
129 const lxb_char_t *end);
130
131static const lxb_char_t *
132lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
133 const lxb_char_t *data,
134 const lxb_char_t *end);
135
136static const lxb_char_t *
137lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
138 const lxb_char_t *data,
139 const lxb_char_t *end);
140
141static const lxb_char_t *
142lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
143 const lxb_char_t *data,
144 const lxb_char_t *end);
145
146static const lxb_char_t *
147lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
148 const lxb_char_t *data,
149 const lxb_char_t *end);
150
151static const lxb_char_t *
152_lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
153 const lxb_char_t *data,
154 const lxb_char_t *end);
155
156static const lxb_char_t *
157lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
158 const lxb_char_t *data,
159 const lxb_char_t *end);
160
161static const lxb_char_t *
162lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
163 const lxb_char_t *data,
164 const lxb_char_t *end);
165
166static const lxb_char_t *
167lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
168 const lxb_char_t *data,
169 const lxb_char_t *end);
170
171static const lxb_char_t *
172lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
173 const lxb_char_t *data,
174 const lxb_char_t *end);
175
176static const lxb_char_t *
177lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
178 const lxb_char_t *data,
179 const lxb_char_t *end);
180
181static const lxb_char_t *
182lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
183 const lxb_char_t *data,
184 const lxb_char_t *end);
185
186static const lxb_char_t *
187lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
188 const lxb_char_t *data,
189 const lxb_char_t *end);
190
191static const lxb_char_t *
192lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
193 const lxb_char_t *data,
194 const lxb_char_t *end);
195
196static size_t
197lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data);
198
199
200/*
201 * Helper function. Not in the specification. For 12.2.5.1 Data state
202 */
203const lxb_char_t *
205 const lxb_char_t *data,
206 const lxb_char_t *end)
207{
208 if (tkz->is_eof == false) {
210 }
211
212 /*
213 * The text node init parameters are set before the token is emitted.
214 */
215
216 tkz->state = lxb_html_tokenizer_state_data;
217
218 return data;
219}
220
221/*
222 * 12.2.5.1 Data state
223 */
224static const lxb_char_t *
225lxb_html_tokenizer_state_data(lxb_html_tokenizer_t *tkz,
226 const lxb_char_t *data, const lxb_char_t *end)
227{
229
230 data = lexbor_swar_seek4(data, end, 0x3C, 0x26, 0x0D, 0x00);
231
232 while (data != end) {
233 switch (*data) {
234 /* U+003C LESS-THAN SIGN (<) */
235 case 0x3C:
238
239 tkz->state = lxb_html_tokenizer_state_tag_open;
240 return (data + 1);
241
242 /* U+0026 AMPERSAND (&) */
243 case 0x26:
245
247 tkz->state_return = lxb_html_tokenizer_state_data;
248
249 return data + 1;
250
251 /* U+000D CARRIAGE RETURN (CR) */
252 case 0x0D:
253 if (++data >= end) {
255
257 tkz->state_return = lxb_html_tokenizer_state_data;
258
259 return data;
260 }
261
263 tkz->pos[-1] = 0x0A;
264
266
267 if (*data != 0x0A) {
269 data--;
270 }
271
272 break;
273
274 /*
275 * U+0000 NULL
276 * EOF
277 */
278 case 0x00:
279 if (tkz->is_eof) {
280 /* Emit TEXT node if not empty */
281 if (tkz->token->begin != NULL) {
283 }
284
285 if (tkz->token->begin != tkz->token->end) {
286 tkz->token->tag_id = LXB_TAG__TEXT;
287
289
292 }
293
294 return end;
295 }
296
297 if (SIZE_MAX - tkz->token->null_count < 1) {
299 return end;
300 }
301
302 tkz->token->null_count++;
303
306 break;
307 }
308
309 data++;
310 }
311
313
314 return data;
315}
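/*
 * Illustrative sketch, not part of lexbor: lexbor_swar_seek4() above is
 * assumed to behave like the plain byte scan below, returning a pointer to
 * the first occurrence of any of the four bytes (or `end` if none is found);
 * the real version in lexbor/core/swar.h uses word-at-a-time (SWAR) tricks
 * to skip over plain text faster.
 */
static const lxb_char_t *
seek4_sketch(const lxb_char_t *data, const lxb_char_t *end,
             lxb_char_t c1, lxb_char_t c2, lxb_char_t c3, lxb_char_t c4)
{
    while (data != end) {
        if (*data == c1 || *data == c2 || *data == c3 || *data == c4) {
            return data;
        }

        data++;
    }

    return data;
}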
316
317/*
318 * Helper function. Not in the specification. For 12.2.5.5 PLAINTEXT state
319 */
320const lxb_char_t *
322 const lxb_char_t *data,
323 const lxb_char_t *end)
324{
325 if (tkz->is_eof == false) {
327 }
328
329 tkz->token->tag_id = LXB_TAG__TEXT;
330
331 tkz->state = lxb_html_tokenizer_state_plaintext;
332
333 return data;
334}
335
336/*
337 * 12.2.5.5 PLAINTEXT state
338 */
339static const lxb_char_t *
340lxb_html_tokenizer_state_plaintext(lxb_html_tokenizer_t *tkz,
341 const lxb_char_t *data,
342 const lxb_char_t *end)
343{
345
346 while (data != end) {
347 switch (*data) {
348 /* U+000D CARRIAGE RETURN (CR) */
349 case 0x0D:
350 if (++data >= end) {
352
354 tkz->state_return = lxb_html_tokenizer_state_plaintext;
355
356 return data;
357 }
358
360 tkz->pos[-1] = 0x0A;
361
363
364 if (*data != 0x0A) {
366 data--;
367 }
368
369 break;
370
371 /*
372 * U+0000 NULL
373 * EOF
374 */
375 case 0x00:
377
378 if (tkz->is_eof) {
379 if (tkz->token->begin != NULL) {
381 }
382
385
386 return end;
387 }
388
391
394 break;
395 }
396
397 data++;
398 }
399
401
402 return data;
403}
404
405/*
406 * 12.2.5.6 Tag open state
407 */
408static const lxb_char_t *
409lxb_html_tokenizer_state_tag_open(lxb_html_tokenizer_t *tkz,
410 const lxb_char_t *data, const lxb_char_t *end)
411{
412 /* ASCII alpha */
413 if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
414 tkz->state = lxb_html_tokenizer_state_tag_name;
415
418
419 return data;
420 }
421
422 /* U+002F SOLIDUS (/) */
423 else if (*data == 0x2F) {
424 tkz->state = lxb_html_tokenizer_state_end_tag_open;
425
426 return (data + 1);
427 }
428
429 /* U+0021 EXCLAMATION MARK (!) */
430 else if (*data == 0x21) {
431 tkz->state = lxb_html_tokenizer_state_markup_declaration_open;
432
434
435 return (data + 1);
436 }
437
438 /* U+003F QUESTION MARK (?) */
439 else if (*data == 0x3F) {
440 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
441
444
447
448 return data;
449 }
450
451 /* EOF */
452 else if (*data == 0x00) {
453 if (tkz->is_eof) {
455
458
461
462 return end;
463 }
464 }
465
467
470
471 tkz->state = lxb_html_tokenizer_state_data;
472
473 return data;
474}
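/*
 * Dispatch examples for the branches above (assumed from the code, not taken
 * from the lexbor documentation):
 *
 *   "<d"  ASCII alpha   -> tag name state
 *   "</"  U+002F        -> end tag open state
 *   "<!"  U+0021        -> markup declaration open state
 *   "<?"  U+003F        -> parse error, reprocessed as a bogus comment
 *   "<1"  anything else -> parse error, the "<" stays in the character data
 *                          and the data state resumes at "1".
 */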
475
476/*
477 * 12.2.5.7 End tag open state
478 */
479static const lxb_char_t *
480lxb_html_tokenizer_state_end_tag_open(lxb_html_tokenizer_t *tkz,
481 const lxb_char_t *data,
482 const lxb_char_t *end)
483{
484 /* ASCII alpha */
485 if (lexbor_str_res_alpha_character[ *data ] != LEXBOR_STR_RES_SLIP) {
486 tkz->state = lxb_html_tokenizer_state_tag_name;
487
490
492
493 return data;
494 }
495
496 /* U+003E GREATER-THAN SIGN (>) */
497 else if (*data == 0x3E) {
498 tkz->state = lxb_html_tokenizer_state_data;
499
502
503 return (data + 1);
504 }
505
506 /* Fake EOF */
507 else if (*data == 0x00) {
508 if (tkz->is_eof) {
510
513
516
517 return end;
518 }
519 }
520
521 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
522
525
528
529 return data;
530}
531
532/*
533 * 12.2.5.8 Tag name state
534 */
535static const lxb_char_t *
536lxb_html_tokenizer_state_tag_name(lxb_html_tokenizer_t *tkz,
537 const lxb_char_t *data, const lxb_char_t *end)
538{
540
541 while (data != end) {
542 switch (*data) {
543 /*
544 * U+0009 CHARACTER TABULATION (tab)
545 * U+000A LINE FEED (LF)
546 * U+000C FORM FEED (FF)
547 * U+000D CARRIAGE RETURN (CR)
548 * U+0020 SPACE
549 */
550 case 0x09:
551 case 0x0A:
552 case 0x0C:
553 case 0x0D:
554 case 0x20:
558
560 return (data + 1);
561
562 /* U+002F SOLIDUS (/) */
563 case 0x2F:
567
569 return (data + 1);
570
571 /* U+003E GREATER-THAN SIGN (>) */
572 case 0x3E:
574
579
580 return (data + 1);
581
582 /* U+0000 NULL */
583 case 0x00:
584 if (tkz->is_eof) {
586
588 tkz->token->end,
590 return end;
591 }
592
596
599 break;
600
601 default:
602 break;
603 }
604
605 data++;
606 }
607
609
610 return data;
611}
612
613/*
614 * 12.2.5.32 Before attribute name state
615 */
616const lxb_char_t *
618 const lxb_char_t *data,
619 const lxb_char_t *end)
620{
622
623 while (data != end) {
624 switch (*data) {
625 /*
626 * U+0009 CHARACTER TABULATION (tab)
627 * U+000A LINE FEED (LF)
628 * U+000C FORM FEED (FF)
629 * U+000D CARRIAGE RETURN (CR)
630 * U+0020 SPACE
631 */
632 case 0x09:
633 case 0x0A:
634 case 0x0C:
635 case 0x0D:
636 case 0x20:
637 break;
638
639 /* U+003D EQUALS SIGN (=) */
640 case 0x3D:
643
645
648
649 tkz->state = lxb_html_tokenizer_state_attribute_name;
650 return (data + 1);
651
652 /*
653 * U+002F SOLIDUS (/)
654 * U+003E GREATER-THAN SIGN (>)
655 */
656 case 0x2F:
657 case 0x3E:
658 tkz->state = lxb_html_tokenizer_state_after_attribute_name;
659 return data;
660
661 /* EOF */
662 case 0x00:
663 if (tkz->is_eof) {
664 tkz->state = lxb_html_tokenizer_state_after_attribute_name;
665 return data;
666 }
667 /* fall through */
668
669 /* Anything else */
670 default:
673
674 tkz->state = lxb_html_tokenizer_state_attribute_name;
675 return data;
676 }
677
678 data++;
679 }
680
681 return data;
682}
683
684/*
685 * 12.2.5.33 Attribute name state
686 */
687static const lxb_char_t *
688lxb_html_tokenizer_state_attribute_name(lxb_html_tokenizer_t *tkz,
689 const lxb_char_t *data,
690 const lxb_char_t *end)
691{
693
694 while (data != end) {
695 switch (*data) {
696 /*
697 * U+0009 CHARACTER TABULATION (tab)
698 * U+000A LINE FEED (LF)
699 * U+000C FORM FEED (FF)
700 * U+000D CARRIAGE RETURN (CR)
701 * U+0020 SPACE
702 * U+002F SOLIDUS (/)
703 * U+003E GREATER-THAN SIGN (>)
704 */
705 case 0x09:
706 case 0x0A:
707 case 0x0C:
708 case 0x0D:
709 case 0x20:
710 case 0x2F:
711 case 0x3E:
715
716 tkz->state = lxb_html_tokenizer_state_after_attribute_name;
717 return data;
718
719 /*
720 * U+0000 NULL
721 * EOF
722 */
723 case 0x00:
724 if (tkz->is_eof) {
726
727 tkz->state = lxb_html_tokenizer_state_after_attribute_name;
728 return data;
729 }
730
734
737 break;
738
739 /* U+003D EQUALS SIGN (=) */
740 case 0x3D:
744
745 tkz->state = lxb_html_tokenizer_state_before_attribute_value;
746 return (data + 1);
747
748 /*
749 * U+0022 QUOTATION MARK (")
750 * U+0027 APOSTROPHE (')
751 * U+003C LESS-THAN SIGN (<)
752 */
753 case 0x22:
754 case 0x27:
755 case 0x3C:
758 break;
759
760 default:
761 break;
762 }
763
764 data++;
765 }
766
768
769 return data;
770}
771
772/*
773 * 12.2.5.34 After attribute name state
774 */
775static const lxb_char_t *
776lxb_html_tokenizer_state_after_attribute_name(lxb_html_tokenizer_t *tkz,
777 const lxb_char_t *data,
778 const lxb_char_t *end)
779{
781
782 while (data != end) {
783 switch (*data) {
784 /*
785 * U+0009 CHARACTER TABULATION (tab)
786 * U+000A LINE FEED (LF)
787 * U+000C FORM FEED (FF)
788 * U+000D CARRIAGE RETURN (CR)
789 * U+0020 SPACE
790 */
791 case 0x09:
792 case 0x0A:
793 case 0x0C:
794 case 0x0D:
795 case 0x20:
796 break;
797
798 /* U+002F SOLIDUS (/) */
799 case 0x2F:
801 return (data + 1);
802
803 /* U+003D EQUALS SIGN (=) */
804 case 0x3D:
805 tkz->state = lxb_html_tokenizer_state_before_attribute_value;
806 return (data + 1);
807
808 /* U+003E GREATER-THAN SIGN (>) */
809 case 0x3E:
811
813
814 return (data + 1);
815
816 case 0x00:
817 if (tkz->is_eof) {
820 return end;
821 }
822 /* fall through */
823
824 default:
827
828 tkz->state = lxb_html_tokenizer_state_attribute_name;
829 return data;
830 }
831
832 data++;
833 }
834
835 return data;
836}
837
838/*
839 * 12.2.5.35 Before attribute value state
840 */
841static const lxb_char_t *
842lxb_html_tokenizer_state_before_attribute_value(lxb_html_tokenizer_t *tkz,
843 const lxb_char_t *data,
844 const lxb_char_t *end)
845{
846 while (data != end) {
847 switch (*data) {
848 /*
849 * U+0009 CHARACTER TABULATION (tab)
850 * U+000A LINE FEED (LF)
851 * U+000C FORM FEED (FF)
852 * U+000D CARRIAGE RETURN (CR)
853 * U+0020 SPACE
854 */
855 case 0x09:
856 case 0x0A:
857 case 0x0C:
858 case 0x0D:
859 case 0x20:
860 break;
861
862 /* U+0022 QUOTATION MARK (") */
863 case 0x22:
864 tkz->state =
865 lxb_html_tokenizer_state_attribute_value_double_quoted;
866
867 return (data + 1);
868
869 /* U+0027 APOSTROPHE (') */
870 case 0x27:
871 tkz->state =
872 lxb_html_tokenizer_state_attribute_value_single_quoted;
873
874 return (data + 1);
875
876 /* U+003E GREATER-THAN SIGN (>) */
877 case 0x3E:
879
882
884
885 return (data + 1);
886
887 default:
888 tkz->state = lxb_html_tokenizer_state_attribute_value_unquoted;
889 return data;
890 }
891
892 data++;
893 }
894
895 return data;
896}
897
898/*
899 * 12.2.5.36 Attribute value (double-quoted) state
900 */
901static const lxb_char_t *
902lxb_html_tokenizer_state_attribute_value_double_quoted(lxb_html_tokenizer_t *tkz,
903 const lxb_char_t *data,
904 const lxb_char_t *end)
905{
906 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
908 }
909
911
912 data = lexbor_swar_seek4(data, end, 0x22, 0x26, 0x0D, 0x00);
913
914 while (data != end) {
915 switch (*data) {
916 /* U+0022 QUOTATION MARK (") */
917 case 0x22:
921
922 tkz->state =
923 lxb_html_tokenizer_state_after_attribute_value_quoted;
924
925 return (data + 1);
926
927 /* U+0026 AMPERSAND (&) */
928 case 0x26:
930
931 tkz->state = lxb_html_tokenizer_state_char_ref_attr;
932 tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
933
934 return data + 1;
935
936 /* U+000D CARRIAGE RETURN (CR) */
937 case 0x0D:
938 if (++data >= end) {
940
942 tkz->state_return = lxb_html_tokenizer_state_attribute_value_double_quoted;
943
944 return data;
945 }
946
948 tkz->pos[-1] = 0x0A;
949
951
952 if (*data != 0x0A) {
954 data--;
955 }
956
957 break;
958
959 /*
960 * U+0000 NULL
961 * EOF
962 */
963 case 0x00:
964 if (tkz->is_eof) {
965 if (tkz->token->attr_last->value_begin != NULL) {
967 }
968
971 return end;
972 }
973
977
980 break;
981
982 default:
983 break;
984 }
985
986 data++;
987 }
988
990
991 return data;
992}
993
994/*
995 * 12.2.5.37 Attribute value (single-quoted) state
996 */
997static const lxb_char_t *
998lxb_html_tokenizer_state_attribute_value_single_quoted(lxb_html_tokenizer_t *tkz,
999 const lxb_char_t *data,
1000 const lxb_char_t *end)
1001{
1002 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1004 }
1005
1007
1008 while (data != end) {
1009 switch (*data) {
1010 /* U+0027 APOSTROPHE (') */
1011 case 0x27:
1015
1016 tkz->state =
1017 lxb_html_tokenizer_state_after_attribute_value_quoted;
1018
1019 return (data + 1);
1020
1021 /* U+0026 AMPERSAND (&) */
1022 case 0x26:
1024
1025 tkz->state = lxb_html_tokenizer_state_char_ref_attr;
1026 tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
1027
1028 return data + 1;
1029
1030 /* U+000D CARRIAGE RETURN (CR) */
1031 case 0x0D:
1032 if (++data >= end) {
1034
1036 tkz->state_return = lxb_html_tokenizer_state_attribute_value_single_quoted;
1037
1038 return data;
1039 }
1040
1042 tkz->pos[-1] = 0x0A;
1043
1045
1046 if (*data != 0x0A) {
1048 data--;
1049 }
1050
1051 break;
1052
1053 /*
1054 * U+0000 NULL
1055 * EOF
1056 */
1057 case 0x00:
1058 if (tkz->is_eof) {
1059 if (tkz->token->attr_last->value_begin != NULL) {
1061 }
1062
1065 return end;
1066 }
1067
1071
1074 break;
1075
1076 default:
1077 break;
1078 }
1079
1080 data++;
1081 }
1082
1084
1085 return data;
1086}
1087
1088/*
1089 * 12.2.5.38 Attribute value (unquoted) state
1090 */
1091static const lxb_char_t *
1092lxb_html_tokenizer_state_attribute_value_unquoted(lxb_html_tokenizer_t *tkz,
1093 const lxb_char_t *data,
1094 const lxb_char_t *end)
1095{
1096 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1098 }
1099
1101
1102 while (data != end) {
1103 switch (*data) {
1104 /*
1105 * U+0009 CHARACTER TABULATION (tab)
1106 * U+000A LINE FEED (LF)
1107 * U+000C FORM FEED (FF)
1108 * U+000D CARRIAGE RETURN (CR)
1109 * U+0020 SPACE
1110 */
1111 case 0x09:
1112 case 0x0A:
1113 case 0x0C:
1114 case 0x0D:
1115 case 0x20:
1119
1121 return (data + 1);
1122
1123 /* U+0026 AMPERSAND (&) */
1124 case 0x26:
1126
1127 tkz->state = lxb_html_tokenizer_state_char_ref_attr;
1128 tkz->state_return = lxb_html_tokenizer_state_attribute_value_unquoted;
1129
1130 return data + 1;
1131
1132 /* U+003E GREATER-THAN SIGN (>) */
1133 case 0x3E:
1135
1139
1141
1142 return (data + 1);
1143
1144 /*
1145 * U+0000 NULL
1146 * EOF
1147 */
1148 case 0x00:
1149 if (tkz->is_eof) {
1150 if (tkz->token->attr_last->value_begin != NULL) {
1152 }
1153
1156 return end;
1157 }
1158
1162
1165 break;
1166
1167 /*
1168 * U+0022 QUOTATION MARK (")
1169 * U+0027 APOSTROPHE (')
1170 * U+003C LESS-THAN SIGN (<)
1171 * U+003D EQUALS SIGN (=)
1172 * U+0060 GRAVE ACCENT (`)
1173 */
1174 case 0x22:
1175 case 0x27:
1176 case 0x3C:
1177 case 0x3D:
1178 case 0x60:
1181 break;
1182
1183 default:
1184 break;
1185 }
1186
1187 data++;
1188 }
1189
1191
1192 return data;
1193}
1194
1195/*
1196 * 12.2.5.39 After attribute value (quoted) state
1197 */
1198static const lxb_char_t *
1199lxb_html_tokenizer_state_after_attribute_value_quoted(lxb_html_tokenizer_t *tkz,
1200 const lxb_char_t *data,
1201 const lxb_char_t *end)
1202{
1203 switch (*data) {
1204 /*
1205 * U+0009 CHARACTER TABULATION (tab)
1206 * U+000A LINE FEED (LF)
1207 * U+000C FORM FEED (FF)
1208 * U+000D CARRIAGE RETURN (CR)
1209 * U+0020 SPACE
1210 */
1211 case 0x09:
1212 case 0x0A:
1213 case 0x0C:
1214 case 0x0D:
1215 case 0x20:
1217
1218 return (data + 1);
1219
1220 /* U+002F SOLIDUS (/) */
1221 case 0x2F:
1223
1224 return (data + 1);
1225
1226 /* U+003E GREATER-THAN SIGN (>) */
1227 case 0x3E:
1229
1231
1232 return (data + 1);
1233
1234 /* EOF */
1235 case 0x00:
1236 if (tkz->is_eof) {
1239 return end;
1240 }
1241 /* fall through */
1242
1243 default:
1246
1248
1249 return data;
1250 }
1251
1252 return data;
1253}
1254
1255
1256const lxb_char_t *
1258 const lxb_char_t *end)
1259{
1261
1262 if (*data == 0x0A) {
1263 data++;
1264 }
1265
1266 tkz->state = tkz->state_return;
1267
1268 return data;
1269}
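/*
 * Worked example of the newline normalization (assumed from the CR handling
 * above and in the per-state loops): CR and CRLF are rewritten to a single
 * LF, even across chunk boundaries.
 *
 *   chunk "a\r\nb"             -> appended as "a\nb"
 *   chunk "a\r" + chunk "\nb"  -> "a\n" is appended from the first chunk;
 *                                 lxb_html_tokenizer_state_cr() then skips
 *                                 the leading "\n" of the second chunk and
 *                                 returns control to the saved state
 *                                 (state_return).
 */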
1270
1271/*
1272 * 12.2.5.40 Self-closing start tag state
1273 */
1274const lxb_char_t *
1276 const lxb_char_t *data,
1277 const lxb_char_t *end)
1278{
1279 switch (*data) {
1280 /* U+003E GREATER-THAN SIGN (>) */
1281 case 0x3E:
1284
1286
1287 return (data + 1);
1288
1289 /* EOF */
1290 case 0x00:
1291 if (tkz->is_eof) {
1294 return end;
1295 }
1296 /* fall through */
1297
1298 default:
1301
1303
1304 return data;
1305 }
1306
1307 return data;
1308}
1309
1310/*
1311 * Helper function. Not in the specification. For 12.2.5.41 Bogus comment state
1312 */
1313static const lxb_char_t *
1314lxb_html_tokenizer_state_bogus_comment_before(lxb_html_tokenizer_t *tkz,
1315 const lxb_char_t *data,
1316 const lxb_char_t *end)
1317{
1319
1320 tkz->state = lxb_html_tokenizer_state_bogus_comment;
1321
1322 return data;
1323}
1324
1325/*
1326 * 12.2.5.41 Bogus comment state
1327 */
1328static const lxb_char_t *
1329lxb_html_tokenizer_state_bogus_comment(lxb_html_tokenizer_t *tkz,
1330 const lxb_char_t *data,
1331 const lxb_char_t *end)
1332{
1334
1335 while (data != end) {
1336 switch (*data) {
1337 /* U+003E GREATER-THAN SIGN (>) */
1338 case 0x3E:
1340
1345
1346 return (data + 1);
1347
1348 /* U+000D CARRIAGE RETURN (CR) */
1349 case 0x0D:
1350 if (++data >= end) {
1352
1354 tkz->state_return = lxb_html_tokenizer_state_bogus_comment;
1355
1356 return data;
1357 }
1358
1360 tkz->pos[-1] = 0x0A;
1361
1363
1364 if (*data != 0x0A) {
1366 data--;
1367 }
1368
1369 break;
1370
1371 /*
1372 * EOF
1373 * U+0000 NULL
1374 */
1375 case 0x00:
1377
1378 if (tkz->is_eof) {
1379 if (tkz->token->begin != NULL) {
1381 }
1382
1385
1386 return end;
1387 }
1388
1391
1394 break;
1395 }
1396
1397 data++;
1398 }
1399
1401
1402 return data;
1403}
1404
1405/*
1406 * 12.2.5.42 Markup declaration open state
1407 */
1408static const lxb_char_t *
1409lxb_html_tokenizer_state_markup_declaration_open(lxb_html_tokenizer_t *tkz,
1410 const lxb_char_t *data,
1411 const lxb_char_t *end)
1412{
1413 /* Check the first character to decide how to change the parse state. */
1414 if (tkz->is_eof == false) {
1416 }
1417
1418 /* U+002D HYPHEN-MINUS characters (-) */
1419 if (*data == 0x2D) {
1420 if ((end - data) < 2) {
1421 tkz->state = lxb_html_tokenizer_state_markup_declaration_comment;
1422 return (data + 1);
1423 }
1424
1425 if (data[1] == 0x2D) {
1427 return (data + 2);
1428 }
1429 }
1430 /*
1431 * ASCII case-insensitive match for the word "DOCTYPE"
1432 * U+0044 character (D) or U+0064 character (d)
1433 */
1434 else if (*data == 0x44 || *data == 0x64) {
1435 if ((end - data) < 7) {
1436 tkz->markup = (lxb_char_t *) "doctype";
1437
1438 tkz->state = lxb_html_tokenizer_state_markup_declaration_doctype;
1439 return data;
1440 }
1441
1442 if (lexbor_str_data_ncasecmp((lxb_char_t *) "doctype", data, 7)) {
1444 return (data + 7);
1445 }
1446 }
1447 /* Case-sensitive match for the string "[CDATA["
1448 * (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE BRACKET
1449 * character before and after)
1450 */
1451 else if (*data == 0x5B) {
1452 if ((end - data) < 7) {
1453 tkz->markup = (lxb_char_t *) "[CDATA[";
1454
1455 tkz->state = lxb_html_tokenizer_state_markup_declaration_cdata;
1456 return data;
1457 }
1458
1459 if (lexbor_str_data_ncmp((lxb_char_t *) "[CDATA[", data, 7)) {
1461
1462 if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
1463 data += 7;
1464
1466
1467 tkz->state = lxb_html_tokenizer_state_cdata_section_before;
1468
1469 return data;
1470 }
1471
1472 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1473
1474 return data;
1475 }
1476 }
1477
1478 if (tkz->is_eof) {
1480
1481 tkz->token->begin = tkz->token->end;
1482 }
1483
1486
1487 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1488
1489 return data;
1490}
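/*
 * Worked examples for the dispatch above (assumed from the code, not taken
 * from the lexbor documentation):
 *
 *   "<!--x-->"       two U+002D after "<!"                   -> comment states
 *   "<!DOCTYPE ..."  ASCII case-insensitive "doctype"        -> DOCTYPE states
 *   "<![CDATA[x]]>"  exact "[CDATA[" while the current
 *                    namespace is not HTML and not undefined -> CDATA section
 *   anything else                                            -> bogus comment
 *
 * If the current chunk ends before the full "doctype" / "[CDATA[" keyword,
 * tkz->markup is pointed at the expected keyword and matching resumes in the
 * *_doctype / *_cdata helper states on the next chunk; the "--" case needs
 * only one more byte and resumes in the *_comment helper instead.
 */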
1491
1492/*
1493 * Helper function. Not in the specification. For 12.2.5.42
1494 * For a comment tag <!--
1495 */
1496static const lxb_char_t *
1497lxb_html_tokenizer_state_markup_declaration_comment(lxb_html_tokenizer_t *tkz,
1498 const lxb_char_t *data,
1499 const lxb_char_t *end)
1500{
1501 /* U+002D HYPHEN-MINUS characters (-) */
1502 if (*data == 0x2D) {
1504 return (data + 1);
1505 }
1506
1509
1510 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1511 return data;
1512}
1513
1514/*
1515 * Helper function. Not in the specification. For 12.2.5.42
1516 * For a DOCTYPE tag <!DOCTYPE
1517 */
1518static const lxb_char_t *
1519lxb_html_tokenizer_state_markup_declaration_doctype(lxb_html_tokenizer_t *tkz,
1520 const lxb_char_t *data,
1521 const lxb_char_t *end)
1522{
1523 const lxb_char_t *pos;
1525
1526 if (pos == NULL) {
1529
1530 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1531 return data;
1532 }
1533
1534 if (*pos == '\0') {
1535 data = (data + (pos - tkz->markup));
1536
1538 return data;
1539 }
1540
1541 tkz->markup = pos;
1542
1543 return end;
1544}
1545
1546/*
1547 * Helper function. Not in the specification. For 12.2.5.42
1548 * For a CDATA tag <![CDATA[
1549 */
1550static const lxb_char_t *
1551lxb_html_tokenizer_state_markup_declaration_cdata(lxb_html_tokenizer_t *tkz,
1552 const lxb_char_t *data,
1553 const lxb_char_t *end)
1554{
1555 const lxb_char_t *pos;
1557
1558 if (pos == NULL) {
1561
1562 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1563 return data;
1564 }
1565
1566 if (*pos == '\0') {
1568
1569 if (ns != LXB_NS_HTML && ns != LXB_NS__UNDEF) {
1570 data = (data + (pos - tkz->markup));
1571
1572 tkz->state = lxb_html_tokenizer_state_cdata_section_before;
1573 return data;
1574 }
1575
1576 lxb_html_tokenizer_state_append_m(tkz, "[CDATA", 6);
1577
1578 tkz->state = lxb_html_tokenizer_state_bogus_comment_before;
1579 return data;
1580 }
1581
1582 tkz->markup = pos;
1583
1584 return end;
1585}
1586
1587/*
1588 * Helper function. Not in the specification. For 12.2.5.69
1589 */
1590static const lxb_char_t *
1591lxb_html_tokenizer_state_cdata_section_before(lxb_html_tokenizer_t *tkz,
1592 const lxb_char_t *data,
1593 const lxb_char_t *end)
1594{
1595 if (tkz->is_eof == false) {
1597 }
1598 else {
1600 }
1601
1602 tkz->token->tag_id = LXB_TAG__TEXT;
1603
1604 tkz->state = lxb_html_tokenizer_state_cdata_section;
1605
1606 return data;
1607}
1608
1609/*
1610 * 12.2.5.69 CDATA section state
1611 */
1612static const lxb_char_t *
1613lxb_html_tokenizer_state_cdata_section(lxb_html_tokenizer_t *tkz,
1614 const lxb_char_t *data,
1615 const lxb_char_t *end)
1616{
1618
1619 while (data != end) {
1620 switch (*data) {
1621 /* U+005D RIGHT SQUARE BRACKET (]) */
1622 case 0x5D:
1625
1626 tkz->state = lxb_html_tokenizer_state_cdata_section_bracket;
1627 return (data + 1);
1628
1629 /* U+000D CARRIAGE RETURN (CR) */
1630 case 0x0D:
1631 if (++data >= end) {
1633
1635 tkz->state_return = lxb_html_tokenizer_state_cdata_section;
1636
1637 return data;
1638 }
1639
1641 tkz->pos[-1] = 0x0A;
1642
1644
1645 if (*data != 0x0A) {
1647 data--;
1648 }
1649
1650 break;
1651
1652 /* EOF */
1653 case 0x00:
1654 if (tkz->is_eof) {
1657
1658 if (tkz->token->begin != NULL) {
1661 }
1662
1665
1666 return end;
1667 }
1668
1669 if (SIZE_MAX - tkz->token->null_count < 1) {
1671 return end;
1672 }
1673
1674 tkz->token->null_count++;
1675
1676 break;
1677
1678 default:
1679 break;
1680 }
1681
1682 data++;
1683 }
1684
1686
1687 return data;
1688}
1689
1690/*
1691 * 12.2.5.70 CDATA section bracket state
1692 */
1693static const lxb_char_t *
1694lxb_html_tokenizer_state_cdata_section_bracket(lxb_html_tokenizer_t *tkz,
1695 const lxb_char_t *data,
1696 const lxb_char_t *end)
1697{
1698 /* U+005D RIGHT SQUARE BRACKET (]) */
1699 if (*data == 0x5D) {
1700 tkz->state = lxb_html_tokenizer_state_cdata_section_end;
1701 return (data + 1);
1702 }
1703
1705
1706 tkz->state = lxb_html_tokenizer_state_cdata_section;
1707
1708 return data;
1709}
1710
1711/*
1712 * 12.2.5.71 CDATA section end state
1713 */
1714static const lxb_char_t *
1715lxb_html_tokenizer_state_cdata_section_end(lxb_html_tokenizer_t *tkz,
1716 const lxb_char_t *data,
1717 const lxb_char_t *end)
1718{
1719 /* U+005D RIGHT SQUARE BRACKET (]) */
1720 if (*data == 0x5D) {
1722 return (data + 1);
1723 }
1724 /* U+003E GREATER-THAN SIGN character */
1725 else if (*data == 0x3E) {
1727
1730
1731 return (data + 1);
1732 }
1733
1735
1736 tkz->state = lxb_html_tokenizer_state_cdata_section;
1737
1738 return data;
1739}
1740
1741/*
1742 * 12.2.5.72 Character reference state
1743 */
1744const lxb_char_t *
1746 const lxb_char_t *data, const lxb_char_t *end)
1747{
1748 tkz->is_attribute = false;
1749
1750 return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
1751}
1752
1753static const lxb_char_t *
1754lxb_html_tokenizer_state_char_ref_attr(lxb_html_tokenizer_t *tkz,
1755 const lxb_char_t *data,
1756 const lxb_char_t *end)
1757{
1758 tkz->is_attribute = true;
1759
1760 return _lxb_html_tokenizer_state_char_ref(tkz, data, end);
1761}
1762
1763static const lxb_char_t *
1764_lxb_html_tokenizer_state_char_ref(lxb_html_tokenizer_t *tkz,
1765 const lxb_char_t *data,
1766 const lxb_char_t *end)
1767{
1768 /* ASCII alphanumeric */
1769 if (lexbor_str_res_alphanumeric_character[ *data ] != LEXBOR_STR_RES_SLIP) {
1770 tkz->entity = &lxb_html_tokenizer_res_entities_sbst[1];
1771 tkz->entity_match = NULL;
1772 tkz->entity_start = (tkz->pos - 1) - tkz->start;
1773
1774 tkz->state = lxb_html_tokenizer_state_char_ref_named;
1775
1776 return data;
1777 }
1778 /* U+0023 NUMBER SIGN (#) */
1779 else if (*data == 0x23) {
1780 tkz->markup = data;
1781 tkz->entity_start = (tkz->pos - 1) - tkz->start;
1782
1784
1785 tkz->state = lxb_html_tokenizer_state_char_ref_numeric;
1786
1787 return (data + 1);
1788 }
1789 else {
1790 tkz->state = tkz->state_return;
1791 }
1792
1793 return data;
1794}
1795
1796/*
1797 * 12.2.5.73 Named character reference state
1798 *
1799 * The slowest part of HTML parsing!!!
1800 *
1801 * This implementation works correctly and passes all tests (including stream parsing).
1802 * We must seriously think about how to accelerate this part.
1803 */
1804static const lxb_char_t *
1805lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
1806 const lxb_char_t *data,
1807 const lxb_char_t *end)
1808{
1809 size_t size, tail_size;
1811 const lexbor_sbst_entry_static_t *entry = tkz->entity;
1812
1813 const lxb_char_t *begin = data;
1814
1815 while (data < end) {
1816 entry = lexbor_sbst_entry_static_find(lxb_html_tokenizer_res_entities_sbst,
1817 entry, *data);
1818 if (entry == NULL) {
1820 goto done;
1821 }
1822
1823 if (entry->value[0] != 0) {
1824 tkz->entity_end = (tkz->pos + (data - begin)) - tkz->start;
1825 tkz->entity_match = entry;
1826 }
1827
1828 entry = &lxb_html_tokenizer_res_entities_sbst[ entry->next ];
1829
1830 data++;
1831 }
1832
1833 /* If the entry is not NULL and the buffer is exhausted, wait for the next buffer. */
1834 tkz->entity = entry;
1835
1837 return data;
1838
1839done:
1840
1841 /* If we have a bad entity */
1842 if (tkz->entity_match == NULL) {
1843 tkz->state = lxb_html_tokenizer_state_char_ref_ambiguous_ampersand;
1844
1845 return data;
1846 }
1847
1848 tkz->state = tkz->state_return;
1849
1850 /*
1851 * If the character reference was consumed as part of an attribute,
1852 * and the last character matched is not a U+003B SEMICOLON character (;),
1853 * and the next input character is either a U+003D EQUALS SIGN character (=)
1854 * or an ASCII alphanumeric, then, for historical reasons,
1855 * flush code points consumed as a character reference
1856 * and switch to the return state.
1857 */
1858 /* U+003B SEMICOLON character (;) */
1859 if (tkz->is_attribute && tkz->entity_match->key != 0x3B) {
1860 /* U+003D EQUALS SIGN character (=) or ASCII alphanumeric */
1861 if (*data == 0x3D
1862 || lexbor_str_res_alphanumeric_character[*data] != LEXBOR_STR_RES_SLIP)
1863 {
1864 return data;
1865 }
1866 }
1867
1868 if (tkz->entity_match->key != 0x3B) {
1871 }
1872
1873 start = &tkz->start[tkz->entity_start];
1874
1875 size = tkz->pos - start;
1876 tail_size = tkz->pos - &tkz->start[tkz->entity_end] - 1;
1877
1878 if (tail_size != 0) {
1879 if ((size + tail_size) + start > tkz->end) {
1881 return end;
1882 }
1883 start = &tkz->start[tkz->entity_start];
1884 }
1885
1887 tkz->pos - tail_size, tail_size);
1888 }
1889
1891
1892 tkz->pos = start + (tkz->entity_match->value_len + tail_size);
1893
1894 return data;
1895}
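/*
 * A worked example of the attribute rule quoted above (an assumption based
 * on the WHATWG character reference rules, not taken from lexbor's own
 * documentation):
 *
 *   <a href="?x=1&copy=2">  "&copy" has no ";" and the next character is
 *                           "=", so the consumed characters are flushed
 *                           as-is and no U+00A9 is substituted.
 *
 *   <a href="?x=1&copy!">   the next character is "!", so "&copy" is still
 *                           replaced by U+00A9, together with a
 *                           missing-semicolon-after-character-reference
 *                           parse error.
 */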
1896
1897/*
1898 * 12.2.5.74 Ambiguous ampersand state
1899 */
1900static const lxb_char_t *
1901lxb_html_tokenizer_state_char_ref_ambiguous_ampersand(lxb_html_tokenizer_t *tkz,
1902 const lxb_char_t *data,
1903 const lxb_char_t *end)
1904{
1905 /* ASCII alphanumeric */
1906 /* Skipped, not needed */
1907
1908 /* U+003B SEMICOLON (;) */
1909 if (*data == 0x3B) {
1912 }
1913
1914 tkz->state = tkz->state_return;
1915
1916 return data;
1917}
1918
1919/*
1920 * 12.2.5.75 Numeric character reference state
1921 */
1922static const lxb_char_t *
1923lxb_html_tokenizer_state_char_ref_numeric(lxb_html_tokenizer_t *tkz,
1924 const lxb_char_t *data,
1925 const lxb_char_t *end)
1926{
1927 tkz->entity_number = 0;
1928
1929 /*
1930 * U+0078 LATIN SMALL LETTER X
1931 * U+0058 LATIN CAPITAL LETTER X
1932 */
1933 if (*data == 0x78 || *data == 0x58) {
1935
1936 tkz->state = lxb_html_tokenizer_state_char_ref_hexademical_start;
1937
1938 return (data + 1);
1939 }
1940
1941 tkz->state = lxb_html_tokenizer_state_char_ref_decimal_start;
1942
1943 return data;
1944}
1945
1946/*
1947 * 12.2.5.76 Hexadecimal character reference start state
1948 */
1949static const lxb_char_t *
1950lxb_html_tokenizer_state_char_ref_hexademical_start(lxb_html_tokenizer_t *tkz,
1951 const lxb_char_t *data,
1952 const lxb_char_t *end)
1953{
1954 /* ASCII hex digit */
1955 if (lexbor_str_res_map_hex[ *data ] != LEXBOR_STR_RES_SLIP) {
1956 tkz->state = lxb_html_tokenizer_state_char_ref_hexademical;
1957 }
1958 else {
1961
1962 tkz->state = tkz->state_return;
1963 }
1964
1965 return data;
1966}
1967
1968/*
1969 * 12.2.5.77 Decimal character reference start state
1970 */
1971static const lxb_char_t *
1972lxb_html_tokenizer_state_char_ref_decimal_start(lxb_html_tokenizer_t *tkz,
1973 const lxb_char_t *data,
1974 const lxb_char_t *end)
1975{
1976 /* ASCII digit */
1977 if (lexbor_str_res_map_num[ *data ] != LEXBOR_STR_RES_SLIP) {
1978 tkz->state = lxb_html_tokenizer_state_char_ref_decimal;
1979 }
1980 else {
1983
1984 tkz->state = tkz->state_return;
1985 }
1986
1987 return data;
1988}
1989
1990/*
1991 * 12.2.5.78 Hexadecimal character reference state
1992 */
1993static const lxb_char_t *
1994lxb_html_tokenizer_state_char_ref_hexademical(lxb_html_tokenizer_t *tkz,
1995 const lxb_char_t *data,
1996 const lxb_char_t *end)
1997{
1998 while (data != end) {
1999 if (lexbor_str_res_map_hex[ *data ] == LEXBOR_STR_RES_SLIP) {
2000 tkz->state = tkz->state_return;
2001
2002 if (*data == ';') {
2003 data++;
2004 }
2005
2006 return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
2007 }
2008
2009 if (tkz->entity_number <= 0x10FFFF) {
2010 tkz->entity_number <<= 4;
2011 tkz->entity_number |= lexbor_str_res_map_hex[ *data ];
2012 }
2013
2014 data++;
2015 }
2016
2017 return data;
2018}
2019
2020/*
2021 * 12.2.5.79 Decimal character reference state
2022 */
2023static const lxb_char_t *
2024lxb_html_tokenizer_state_char_ref_decimal(lxb_html_tokenizer_t *tkz,
2025 const lxb_char_t *data,
2026 const lxb_char_t *end)
2027{
2028 while (data != end) {
2029 if (lexbor_str_res_map_num[ *data ] == LEXBOR_STR_RES_SLIP) {
2030 tkz->state = tkz->state_return;
2031
2032 if (*data == ';') {
2033 data++;
2034 }
2035
2036 return lxb_html_tokenizer_state_char_ref_numeric_end(tkz, data, end);
2037 }
2038
2039 if (tkz->entity_number <= 0x10FFFF) {
2040 tkz->entity_number = lexbor_str_res_map_num[ *data ]
2041 + tkz->entity_number * 10;
2042 }
2043
2044 data++;
2045 }
2046
2047 return data;
2048}
2049
2050/*
2051 * 12.2.5.80 Numeric character reference end state
2052 */
2053static const lxb_char_t *
2054lxb_html_tokenizer_state_char_ref_numeric_end(lxb_html_tokenizer_t *tkz,
2055 const lxb_char_t *data,
2056 const lxb_char_t *end)
2057{
2058 lxb_char_t *start = &tkz->start[tkz->entity_start];
2059
2060 if ((start + 4) > tkz->end) {
2062 return end;
2063 }
2064
2065 start = &tkz->start[tkz->entity_start];
2066 }
2067
2068 if (tkz->entity_number == 0x00) {
2071
2072 goto xFFFD;
2073 }
2074 else if (tkz->entity_number > 0x10FFFF) {
2077
2078 goto xFFFD;
2079 }
2080 else if (tkz->entity_number >= 0xD800 && tkz->entity_number <= 0xDFFF) {
2083
2084 goto xFFFD;
2085 }
2086 else if (tkz->entity_number >= 0xFDD0 && tkz->entity_number <= 0xFDEF) {
2089 }
2090
2091 switch (tkz->entity_number) {
2092 case 0xFFFE: case 0xFFFF: case 0x1FFFE: case 0x1FFFF: case 0x2FFFE:
2093 case 0x2FFFF: case 0x3FFFE: case 0x3FFFF: case 0x4FFFE: case 0x4FFFF:
2094 case 0x5FFFE: case 0x5FFFF: case 0x6FFFE: case 0x6FFFF: case 0x7FFFE:
2095 case 0x7FFFF: case 0x8FFFE: case 0x8FFFF: case 0x9FFFE: case 0x9FFFF:
2096 case 0xAFFFE: case 0xAFFFF: case 0xBFFFE: case 0xBFFFF: case 0xCFFFE:
2097 case 0xCFFFF: case 0xDFFFE: case 0xDFFFF: case 0xEFFFE: case 0xEFFFF:
2098 case 0xFFFFE: case 0xFFFFF:
2099 case 0x10FFFE:
2100 case 0x10FFFF:
2103 break;
2104
2105 default:
2106 break;
2107 }
2108
2109 if ((tkz->entity_number <= 0x1F
2110 && tkz->entity_number != 0x09 /* TAB */
2111 && tkz->entity_number != 0x0A /* LINE FEED (LF) */
2112 && tkz->entity_number != 0x0C /* FORM FEED (FF) */
2113 && tkz->entity_number != 0x20) /* SPACE */
2114 || (tkz->entity_number >= 0x7F && tkz->entity_number <= 0x9F))
2115 {
2118 }
2119
2120 if (tkz->entity_number <= 0x9F) {
2121 tkz->entity_number = (uint32_t) lexbor_str_res_replacement_character[tkz->entity_number];
2122 }
2123
2124 start += lxb_html_tokenizer_state_to_ascii_utf_8(tkz->entity_number, start);
2125
2126 tkz->pos = start;
2127
2128 return data;
2129
2130xFFFD:
2131
2132 memcpy(start, lexbor_str_res_ansi_replacement_character,
2133 sizeof(lexbor_str_res_ansi_replacement_character) - 1);
2134
2135 tkz->pos = start + sizeof(lexbor_str_res_ansi_replacement_character) - 1;
2136
2137 return data;
2138}
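/*
 * Worked examples for the checks above (assumed from the code and the HTML
 * "Numeric character reference end state" rules):
 *
 *   &#x20AC;   -> entity_number = 0x20AC, written to the buffer as E2 82 AC
 *   &#0;       -> null-character-reference error, U+FFFD is written instead
 *   &#x110000; -> character-reference-outside-unicode-range error, U+FFFD
 *   &#xD83D;   -> surrogate-character-reference error, U+FFFD
 *   &#128;     -> control-character-reference error; 0x80 is first remapped
 *                 through lexbor_str_res_replacement_character (U+20AC in
 *                 the spec's replacement table) and then encoded as UTF-8.
 */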
2139
2140static size_t
2141lxb_html_tokenizer_state_to_ascii_utf_8(size_t codepoint, lxb_char_t *data)
2142{
2143 /* 0x80 -- 10xxxxxx */
2144 /* 0xC0 -- 110xxxxx */
2145 /* 0xE0 -- 1110xxxx */
2146 /* 0xF0 -- 11110xxx */
2147
2148 if (codepoint <= 0x0000007F) {
2149 /* 0xxxxxxx */
2150 data[0] = (char) codepoint;
2151
2152 return 1;
2153 }
2154 else if (codepoint <= 0x000007FF) {
2155 /* 110xxxxx 10xxxxxx */
2156 data[0] = (char) (0xC0 | (codepoint >> 6 ));
2157 data[1] = (char) (0x80 | (codepoint & 0x3F));
2158
2159 return 2;
2160 }
2161 else if (codepoint <= 0x0000FFFF) {
2162 /* 1110xxxx 10xxxxxx 10xxxxxx */
2163 data[0] = (char) (0xE0 | ((codepoint >> 12)));
2164 data[1] = (char) (0x80 | ((codepoint >> 6 ) & 0x3F));
2165 data[2] = (char) (0x80 | ( codepoint & 0x3F));
2166
2167 return 3;
2168 }
2169 else if (codepoint <= 0x001FFFFF) {
2170 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
2171 data[0] = (char) (0xF0 | ( codepoint >> 18));
2172 data[1] = (char) (0x80 | ((codepoint >> 12) & 0x3F));
2173 data[2] = (char) (0x80 | ((codepoint >> 6 ) & 0x3F));
2174 data[3] = (char) (0x80 | ( codepoint & 0x3F));
2175
2176 return 4;
2177 }
2178
2179 return 0;
2180}
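/*
 * A minimal usage sketch of the encoder above (hypothetical, for
 * illustration only): the return value is the number of bytes written,
 * 0 for code points above U+1FFFFF.
 */
static void
lxb_html_tokenizer_state_to_ascii_utf_8_example(void)
{
    lxb_char_t buf[4];

    (void) lxb_html_tokenizer_state_to_ascii_utf_8(0x0024, buf);
    /* U+0024  -> 24          (1 byte)  */

    (void) lxb_html_tokenizer_state_to_ascii_utf_8(0x00A2, buf);
    /* U+00A2  -> C2 A2       (2 bytes) */

    (void) lxb_html_tokenizer_state_to_ascii_utf_8(0x20AC, buf);
    /* U+20AC  -> E2 82 AC    (3 bytes) */

    (void) lxb_html_tokenizer_state_to_ascii_utf_8(0x10348, buf);
    /* U+10348 -> F0 90 8D 88 (4 bytes) */
}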