php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
state_doctype.c
Go to the documentation of this file.
1/*
2 * Copyright (C) 2018-2020 Alexander Borisov
3 *
4 * Author: Alexander Borisov <borisov@lexbor.com>
5 */
6
9
10
11#define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER
12#include "lexbor/core/str_res.h"
13
14
17 const lxb_char_t *name, size_t length);
18
19
20static const lxb_char_t *
21lxb_html_tokenizer_state_doctype(lxb_html_tokenizer_t *tkz,
22 const lxb_char_t *data,
23 const lxb_char_t *end);
24
25static const lxb_char_t *
26lxb_html_tokenizer_state_doctype_before_name(lxb_html_tokenizer_t *tkz,
27 const lxb_char_t *data,
28 const lxb_char_t *end);
29
30static const lxb_char_t *
31lxb_html_tokenizer_state_doctype_name(lxb_html_tokenizer_t *tkz,
32 const lxb_char_t *data,
33 const lxb_char_t *end);
34
35static const lxb_char_t *
36lxb_html_tokenizer_state_doctype_after_name(lxb_html_tokenizer_t *tkz,
37 const lxb_char_t *data,
38 const lxb_char_t *end);
39
40static const lxb_char_t *
41lxb_html_tokenizer_state_doctype_after_name_public(lxb_html_tokenizer_t *tkz,
42 const lxb_char_t *data,
43 const lxb_char_t *end);
44
45static const lxb_char_t *
46lxb_html_tokenizer_state_doctype_after_name_system(lxb_html_tokenizer_t *tkz,
47 const lxb_char_t *data,
48 const lxb_char_t *end);
49
50static const lxb_char_t *
51lxb_html_tokenizer_state_doctype_after_public_keyword(lxb_html_tokenizer_t *tkz,
52 const lxb_char_t *data,
53 const lxb_char_t *end);
54
55static const lxb_char_t *
56lxb_html_tokenizer_state_doctype_before_public_identifier(
58 const lxb_char_t *data,
59 const lxb_char_t *end);
60
61static const lxb_char_t *
62lxb_html_tokenizer_state_doctype_public_identifier_double_quoted(
64 const lxb_char_t *data,
65 const lxb_char_t *end);
66
67static const lxb_char_t *
68lxb_html_tokenizer_state_doctype_public_identifier_single_quoted(
70 const lxb_char_t *data,
71 const lxb_char_t *end);
72
73static const lxb_char_t *
74lxb_html_tokenizer_state_doctype_after_public_identifier(
76 const lxb_char_t *data,
77 const lxb_char_t *end);
78
79static const lxb_char_t *
80lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers(
82 const lxb_char_t *data,
83 const lxb_char_t *end);
84
85static const lxb_char_t *
86lxb_html_tokenizer_state_doctype_after_system_keyword(lxb_html_tokenizer_t *tkz,
87 const lxb_char_t *data,
88 const lxb_char_t *end);
89
90static const lxb_char_t *
91lxb_html_tokenizer_state_doctype_before_system_identifier(
93 const lxb_char_t *data,
94 const lxb_char_t *end);
95
96static const lxb_char_t *
97lxb_html_tokenizer_state_doctype_system_identifier_double_quoted(
99 const lxb_char_t *data,
100 const lxb_char_t *end);
101
102static const lxb_char_t *
103lxb_html_tokenizer_state_doctype_system_identifier_single_quoted(
105 const lxb_char_t *data,
106 const lxb_char_t *end);
107
108static const lxb_char_t *
109lxb_html_tokenizer_state_doctype_after_system_identifier(
111 const lxb_char_t *data,
112 const lxb_char_t *end);
113
114static const lxb_char_t *
115lxb_html_tokenizer_state_doctype_bogus(lxb_html_tokenizer_t *tkz,
116 const lxb_char_t *data,
117 const lxb_char_t *end);
118
119
120/*
121 * Helper function. Not in the specification. For 12.2.5.53
122 */
123const lxb_char_t *
125 const lxb_char_t *data,
126 const lxb_char_t *end)
127{
128 if (tkz->is_eof == false) {
130 }
131 else {
133 }
134
136
137 return lxb_html_tokenizer_state_doctype(tkz, data, end);
138}
139
140/*
141 * 12.2.5.53 DOCTYPE state
142 */
143static const lxb_char_t *
144lxb_html_tokenizer_state_doctype(lxb_html_tokenizer_t *tkz,
145 const lxb_char_t *data,
146 const lxb_char_t *end)
147{
148 switch (*data) {
149 /*
150 * U+0009 CHARACTER TABULATION (tab)
151 * U+000A LINE FEED (LF)
152 * U+000C FORM FEED (FF)
153 * U+000D CARRIAGE RETURN (CR)
154 * U+0020 SPACE
155 */
156 case 0x09:
157 case 0x0A:
158 case 0x0C:
159 case 0x0D:
160 case 0x20:
161 data++;
162 break;
163
164 /* U+003E GREATER-THAN SIGN (>) */
165 case 0x3E:
166 break;
167
168 /* EOF */
169 case 0x00:
170 if (tkz->is_eof) {
173
175
177
178 return end;
179 }
180 /* fall through */
181
182 default:
185 break;
186 }
187
188 tkz->state = lxb_html_tokenizer_state_doctype_before_name;
189
190 return data;
191}
192
193/*
194 * 12.2.5.54 Before DOCTYPE name state
195 */
196static const lxb_char_t *
197lxb_html_tokenizer_state_doctype_before_name(lxb_html_tokenizer_t *tkz,
198 const lxb_char_t *data,
199 const lxb_char_t *end)
200{
202
203 while (data != end) {
204 switch (*data) {
205 /*
206 * U+0009 CHARACTER TABULATION (tab)
207 * U+000A LINE FEED (LF)
208 * U+000C FORM FEED (FF)
209 * U+000D CARRIAGE RETURN (CR)
210 * U+0020 SPACE
211 */
212 case 0x09:
213 case 0x0A:
214 case 0x0C:
215 case 0x0D:
216 case 0x20:
217 break;
218
219 /*
220 * U+0000 NULL
221 * EOF
222 */
223 case 0x00:
224 if (tkz->is_eof) {
227
229
231
232 return end;
233 }
234
238
241
242 tkz->token->attr_last->type
244
245 tkz->state = lxb_html_tokenizer_state_doctype_name;
246
247 return (data + 1);
248
249 /* U+003E GREATER-THAN SIGN (>) */
250 case 0x3E:
253
256
258
259 return (data + 1);
260
261 /*
262 * ASCII upper alpha
263 * Anything else
264 */
265 default:
268
269 tkz->state = lxb_html_tokenizer_state_doctype_name;
270
271 return data;
272 }
273
274 data++;
275 }
276
277 return data;
278}
279
280/*
281 * 12.2.5.55 DOCTYPE name state
282 */
283static const lxb_char_t *
284lxb_html_tokenizer_state_doctype_name(lxb_html_tokenizer_t *tkz,
285 const lxb_char_t *data,
286 const lxb_char_t *end)
287{
289
290 while (data != end) {
291 switch (*data) {
292 /*
293 * U+0009 CHARACTER TABULATION (tab)
294 * U+000A LINE FEED (LF)
295 * U+000C FORM FEED (FF)
296 * U+000D CARRIAGE RETURN (CR)
297 * U+0020 SPACE
298 */
299 case 0x09:
300 case 0x0A:
301 case 0x0C:
302 case 0x0D:
303 case 0x20:
307
308 tkz->state = lxb_html_tokenizer_state_doctype_after_name;
309
310 return (data + 1);
311
312 /* U+003E GREATER-THAN SIGN (>) */
313 case 0x3E:
315
320
321 return (data + 1);
322
323 /*
324 * U+0000 NULL
325 * EOF
326 */
327 case 0x00:
329
330 if (tkz->is_eof) {
332
334 tkz->token->attr_last->name_end,
336
338
341
342 return end;
343 }
344
347
350
351 tkz->token->attr_last->type
353
354 break;
355
356 /* Anything else */
357 default:
358 break;
359 }
360
361 data++;
362 }
363
365
366 return data;
367}
368
369/*
370 * 12.2.5.56 After DOCTYPE name state
371 */
372static const lxb_char_t *
373lxb_html_tokenizer_state_doctype_after_name(lxb_html_tokenizer_t *tkz,
374 const lxb_char_t *data,
375 const lxb_char_t *end)
376{
378 const lxb_dom_attr_data_t *attr_data;
379
380 while (data != end) {
381 switch (*data) {
382 /*
383 * U+0009 CHARACTER TABULATION (tab)
384 * U+000A LINE FEED (LF)
385 * U+000C FORM FEED (FF)
386 * U+000D CARRIAGE RETURN (CR)
387 * U+0020 SPACE
388 */
389 case 0x09:
390 case 0x0A:
391 case 0x0C:
392 case 0x0D:
393 case 0x20:
394 break;
395
396 /* U+003E GREATER-THAN SIGN (>) */
397 case 0x3E:
399
401
402 return (data + 1);
403
404 /* EOF */
405 case 0x00:
406 if (tkz->is_eof) {
409
411
413
414 return end;
415 }
416 /* fall through */
417
418 /* Anything else */
419 default:
422
423 if ((data + 6) > end) {
424 /*
425 * ASCII case-insensitive match for the word "PUBLIC"
426 * U+0050 character (P) or U+0070 character (p)
427 */
428 if (*data == 0x50 || *data == 0x70) {
429 tkz->markup = (lxb_char_t *) "PUBLIC";
430
431 tkz->state =
432 lxb_html_tokenizer_state_doctype_after_name_public;
433
434 return data;
435 }
436
437 /*
438 * ASCII case-insensitive match for the word "SYSTEM"
439 * U+0053 character (S) or U+0073 character (s)
440 */
441 if (*data == 0x53 || *data == 0x73) {
442 tkz->markup = (lxb_char_t *) "SYSTEM";
443
444 tkz->state =
445 lxb_html_tokenizer_state_doctype_after_name_system;
446
447 return data;
448 }
449 }
450 else if (lexbor_str_data_ncasecmp((lxb_char_t *) "PUBLIC",
451 data, 6))
452 {
454 (data + 6));
455
456 attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
458 if (attr_data == NULL) {
460 return end;
461 }
462
463 tkz->token->attr_last->name = attr_data;
464
465 tkz->state =
466 lxb_html_tokenizer_state_doctype_after_public_keyword;
467
468 return (data + 6);
469 }
470 else if (lexbor_str_data_ncasecmp((lxb_char_t *) "SYSTEM",
471 data, 6))
472 {
474 (data + 6));
475
476 attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
478 if (attr_data == NULL) {
480 return end;
481 }
482
483 tkz->token->attr_last->name = attr_data;
484
485 tkz->state =
486 lxb_html_tokenizer_state_doctype_after_system_keyword;
487
488 return (data + 6);
489 }
490
492 tkz->dobj_token_attr);
493
496
498 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
499
500 return data;
501 }
502
503 data++;
504 }
505
506 return data;
507}
508
509/*
510 * Helper function. Not in the specification. For 12.2.5.56
511 * For doctype PUBLIC
512 */
513static const lxb_char_t *
514lxb_html_tokenizer_state_doctype_after_name_public(lxb_html_tokenizer_t *tkz,
515 const lxb_char_t *data,
516 const lxb_char_t *end)
517{
518 const lxb_char_t *pos;
519 const lxb_dom_attr_data_t *attr_data;
520
522
523 if (pos == NULL) {
525 tkz->dobj_token_attr);
526
529
530 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
531
532 return data;
533 }
534
535 if (*pos == '\0') {
536 pos = data + (pos - tkz->markup);
537
539
540 attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
542 if (attr_data == NULL) {
544 return end;
545 }
546
547 tkz->token->attr_last->name = attr_data;
548
549 tkz->state = lxb_html_tokenizer_state_doctype_after_public_keyword;
550
551 return (pos + 1);
552 }
553
554 tkz->markup = pos;
555
556 return end;
557}
558
559/*
560 * Helper function. Not in the specification. For 12.2.5.56
561 * For doctype SYSTEM
562 */
563static const lxb_char_t *
564lxb_html_tokenizer_state_doctype_after_name_system(lxb_html_tokenizer_t *tkz,
565 const lxb_char_t *data,
566 const lxb_char_t *end)
567{
568 const lxb_char_t *pos;
569 const lxb_dom_attr_data_t *attr_data;
570
572
573 if (pos == NULL) {
575 tkz->dobj_token_attr);
576
579
580 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
581
582 return data;
583 }
584
585 if (*pos == '\0') {
586 pos = data + (pos - tkz->markup);
587
589
590 attr_data = lxb_dom_attr_data_by_id(tkz->attrs,
592 if (attr_data == NULL) {
594 return end;
595 }
596
597 tkz->token->attr_last->name = attr_data;
598
599 tkz->state = lxb_html_tokenizer_state_doctype_after_system_keyword;
600
601 return (pos + 1);
602 }
603
604 tkz->markup = pos;
605
606 return end;
607}
608
609/*
610 * 12.2.5.57 After DOCTYPE public keyword state
611 */
612static const lxb_char_t *
613lxb_html_tokenizer_state_doctype_after_public_keyword(lxb_html_tokenizer_t *tkz,
614 const lxb_char_t *data,
615 const lxb_char_t *end)
616{
617 switch (*data) {
618 /*
619 * U+0009 CHARACTER TABULATION (tab)
620 * U+000A LINE FEED (LF)
621 * U+000C FORM FEED (FF)
622 * U+000D CARRIAGE RETURN (CR)
623 * U+0020 SPACE
624 */
625 case 0x09:
626 case 0x0A:
627 case 0x0C:
628 case 0x0D:
629 case 0x20:
630 tkz->state =
631 lxb_html_tokenizer_state_doctype_before_public_identifier;
632
633 return (data + 1);
634
635 /* U+0022 QUOTATION MARK (") */
636 case 0x22:
639
640 tkz->state =
641 lxb_html_tokenizer_state_doctype_public_identifier_double_quoted;
642
643 return (data + 1);
644
645 /* U+0027 APOSTROPHE (') */
646 case 0x27:
649
650 tkz->state =
651 lxb_html_tokenizer_state_doctype_public_identifier_single_quoted;
652
653 return (data + 1);
654
655 /* U+003E GREATER-THAN SIGN (>) */
656 case 0x3E:
659
662
664
665 return (data + 1);
666
667 /* EOF */
668 case 0x00:
669 if (tkz->is_eof) {
671
674
676
677 return end;
678 }
679 /* fall through */
680
681 default:
683 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
684
687
688 return data;
689 }
690
691 return data;
692}
693
694/*
695 * 12.2.5.58 Before DOCTYPE public identifier state
696 */
697static const lxb_char_t *
698lxb_html_tokenizer_state_doctype_before_public_identifier(lxb_html_tokenizer_t *tkz,
699 const lxb_char_t *data,
700 const lxb_char_t *end)
701{
702 switch (*data) {
703 /*
704 * U+0009 CHARACTER TABULATION (tab)
705 * U+000A LINE FEED (LF)
706 * U+000C FORM FEED (FF)
707 * U+000D CARRIAGE RETURN (CR)
708 * U+0020 SPACE
709 */
710 case 0x09:
711 case 0x0A:
712 case 0x0C:
713 case 0x0D:
714 case 0x20:
715 break;
716
717 /* U+0022 QUOTATION MARK (") */
718 case 0x22:
719 tkz->state =
720 lxb_html_tokenizer_state_doctype_public_identifier_double_quoted;
721
722 break;
723
724 /* U+0027 APOSTROPHE (') */
725 case 0x27:
726 tkz->state =
727 lxb_html_tokenizer_state_doctype_public_identifier_single_quoted;
728
729 break;
730
731 /* U+003E GREATER-THAN SIGN (>) */
732 case 0x3E:
735
738
740
741 break;
742
743 /* EOF */
744 case 0x00:
745 if (tkz->is_eof) {
748
750
752
753 return end;
754 }
755 /* fall through */
756
757 default:
760
762 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
763
764 return data;
765 }
766
767 return (data + 1);
768}
769
770/*
771 * 12.2.5.59 DOCTYPE public identifier (double-quoted) state
772 */
773static const lxb_char_t *
774lxb_html_tokenizer_state_doctype_public_identifier_double_quoted(lxb_html_tokenizer_t *tkz,
775 const lxb_char_t *data,
776 const lxb_char_t *end)
777{
778 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
780 }
781
783
784 while (data != end) {
785 switch (*data) {
786 /* U+0022 QUOTATION MARK (") */
787 case 0x22:
791
792 tkz->state =
793 lxb_html_tokenizer_state_doctype_after_public_identifier;
794
795 return (data + 1);
796
797 /* U+003E GREATER-THAN SIGN (>) */
798 case 0x3E:
800
803
808
809 return (data + 1);
810
811 /* U+000D CARRIAGE RETURN (CR) */
812 case 0x0D:
813 if (++data >= end) {
815
817 tkz->state_return = lxb_html_tokenizer_state_doctype_public_identifier_double_quoted;
818
819 return data;
820 }
821
823 tkz->pos[-1] = 0x0A;
824
826
827 if (*data != 0x0A) {
829 data--;
830 }
831
832 break;
833
834 /*
835 * U+0000 NULL
836 * EOF
837 */
838 case 0x00:
840
841 if (tkz->is_eof) {
843
844 if (tkz->token->attr_last->value_begin == NULL) {
846 tkz->token->attr_last->value_end);
847 }
848
852
854
857
858 return end;
859 }
860
863
866
867 tkz->token->attr_last->type
869
870 break;
871
872 /* Anything else */
873 default:
874 break;
875 }
876
877 data++;
878 }
879
881
882 return data;
883}
884
885/*
886 * 12.2.5.60 DOCTYPE public identifier (single-quoted) state
887 */
888static const lxb_char_t *
889lxb_html_tokenizer_state_doctype_public_identifier_single_quoted(lxb_html_tokenizer_t *tkz,
890 const lxb_char_t *data,
891 const lxb_char_t *end)
892{
893 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
895 }
896
898
899 while (data != end) {
900 switch (*data) {
901 /* U+0027 APOSTROPHE (') */
902 case 0x27:
906
907 tkz->state =
908 lxb_html_tokenizer_state_doctype_after_public_identifier;
909
910 return (data + 1);
911
912 /* U+003E GREATER-THAN SIGN (>) */
913 case 0x3E:
915
918
923
924 return (data + 1);
925
926 /* U+000D CARRIAGE RETURN (CR) */
927 case 0x0D:
928 if (++data >= end) {
930
932 tkz->state_return = lxb_html_tokenizer_state_doctype_public_identifier_single_quoted;
933
934 return data;
935 }
936
938 tkz->pos[-1] = 0x0A;
939
941
942 if (*data != 0x0A) {
944 data--;
945 }
946
947 break;
948
949 /*
950 * U+0000 NULL
951 * EOF
952 */
953 case 0x00:
955
956 if (tkz->is_eof) {
958
959 if (tkz->token->attr_last->value_begin == NULL) {
961 tkz->token->attr_last->value_end);
962 }
963
967
969
972
973 return end;
974 }
975
978
981
982 tkz->token->attr_last->type
984
985 break;
986
987 /* Anything else */
988 default:
989 break;
990 }
991
992 data++;
993 }
994
996
997 return data;
998}
999
1000/*
1001 * 12.2.5.61 After DOCTYPE public identifier state
1002 */
1003static const lxb_char_t *
1004lxb_html_tokenizer_state_doctype_after_public_identifier(lxb_html_tokenizer_t *tkz,
1005 const lxb_char_t *data,
1006 const lxb_char_t *end)
1007{
1009
1010 switch (*data) {
1011 /*
1012 * U+0009 CHARACTER TABULATION (tab)
1013 * U+000A LINE FEED (LF)
1014 * U+000C FORM FEED (FF)
1015 * U+000D CARRIAGE RETURN (CR)
1016 * U+0020 SPACE
1017 */
1018 case 0x09:
1019 case 0x0A:
1020 case 0x0C:
1021 case 0x0D:
1022 case 0x20:
1023 tkz->state =
1024 lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers;
1025
1026 return (data + 1);
1027
1028 /* U+003E GREATER-THAN SIGN (>) */
1029 case 0x3E:
1031
1033
1034 return (data + 1);
1035
1036 /* U+0022 QUOTATION MARK (") */
1037 case 0x22:
1040
1042
1043 tkz->state =
1044 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1045
1046 return (data + 1);
1047
1048 /* U+0027 APOSTROPHE (') */
1049 case 0x27:
1052
1054
1055 tkz->state =
1056 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1057
1058 return (data + 1);
1059
1060 /* EOF */
1061 case 0x00:
1062 if (tkz->is_eof) {
1065
1068
1069 return end;
1070 }
1071 /* fall through */
1072
1073 default:
1076
1078 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1079
1080 return data;
1081 }
1082
1083 return data;
1084}
1085
1086/*
1087 * 12.2.5.62 Between DOCTYPE public and system identifiers state
1088 */
1089static const lxb_char_t *
1090lxb_html_tokenizer_state_doctype_between_public_and_system_identifiers(lxb_html_tokenizer_t *tkz,
1091 const lxb_char_t *data,
1092 const lxb_char_t *end)
1093{
1095
1096 switch (*data) {
1097 /*
1098 * U+0009 CHARACTER TABULATION (tab)
1099 * U+000A LINE FEED (LF)
1100 * U+000C FORM FEED (FF)
1101 * U+000D CARRIAGE RETURN (CR)
1102 * U+0020 SPACE
1103 */
1104 case 0x09:
1105 case 0x0A:
1106 case 0x0C:
1107 case 0x0D:
1108 case 0x20:
1109 return (data + 1);
1110
1111 /* U+003E GREATER-THAN SIGN (>) */
1112 case 0x3E:
1114
1116
1117 return (data + 1);
1118
1119 /* U+0022 QUOTATION MARK (") */
1120 case 0x22:
1122
1123 tkz->state =
1124 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1125
1126 return (data + 1);
1127
1128 /* U+0027 APOSTROPHE (') */
1129 case 0x27:
1131
1132 tkz->state =
1133 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1134
1135 return (data + 1);
1136
1137 /* EOF */
1138 case 0x00:
1139 if (tkz->is_eof) {
1142
1145
1146 return end;
1147 }
1148 /* fall through */
1149
1150 default:
1153
1155 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1156
1157 return data;
1158 }
1159
1160 return data;
1161}
1162
1163/*
1164 * 12.2.5.63 After DOCTYPE system keyword state
1165 */
1166static const lxb_char_t *
1167lxb_html_tokenizer_state_doctype_after_system_keyword(lxb_html_tokenizer_t *tkz,
1168 const lxb_char_t *data,
1169 const lxb_char_t *end)
1170{
1171 switch (*data) {
1172 /*
1173 * U+0009 CHARACTER TABULATION (tab)
1174 * U+000A LINE FEED (LF)
1175 * U+000C FORM FEED (FF)
1176 * U+000D CARRIAGE RETURN (CR)
1177 * U+0020 SPACE
1178 */
1179 case 0x09:
1180 case 0x0A:
1181 case 0x0C:
1182 case 0x0D:
1183 case 0x20:
1184 tkz->state =
1185 lxb_html_tokenizer_state_doctype_before_system_identifier;
1186
1187 return (data + 1);
1188
1189 /* U+0022 QUOTATION MARK (") */
1190 case 0x22:
1193
1194 tkz->state =
1195 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1196
1197 return (data + 1);
1198
1199 /* U+0027 APOSTROPHE (') */
1200 case 0x27:
1203
1204 tkz->state =
1205 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1206
1207 return (data + 1);
1208
1209 /* U+003E GREATER-THAN SIGN (>) */
1210 case 0x3E:
1213
1216
1218
1219 return (data + 1);
1220
1221 /* EOF */
1222 case 0x00:
1223 if (tkz->is_eof) {
1226
1229
1230 return end;
1231 }
1232 /* fall through */
1233
1234 default:
1236 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1237
1240
1241 return data;
1242 }
1243
1244 return data;
1245}
1246
1247/*
1248 * 12.2.5.64 Before DOCTYPE system identifier state
1249 */
1250static const lxb_char_t *
1251lxb_html_tokenizer_state_doctype_before_system_identifier(lxb_html_tokenizer_t *tkz,
1252 const lxb_char_t *data,
1253 const lxb_char_t *end)
1254{
1255 switch (*data) {
1256 /*
1257 * U+0009 CHARACTER TABULATION (tab)
1258 * U+000A LINE FEED (LF)
1259 * U+000C FORM FEED (FF)
1260 * U+000D CARRIAGE RETURN (CR)
1261 * U+0020 SPACE
1262 */
1263 case 0x09:
1264 case 0x0A:
1265 case 0x0C:
1266 case 0x0D:
1267 case 0x20:
1268 return (data + 1);
1269
1270 /* U+0022 QUOTATION MARK (") */
1271 case 0x22:
1272 tkz->state =
1273 lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1274
1275 return (data + 1);
1276
1277 /* U+0027 APOSTROPHE (') */
1278 case 0x27:
1279 tkz->state =
1280 lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1281
1282 return (data + 1);
1283
1284 /* U+003E GREATER-THAN SIGN (>) */
1285 case 0x3E:
1288
1291
1293
1294 return (data + 1);
1295
1296 /* EOF */
1297 case 0x00:
1298 if (tkz->is_eof) {
1301
1303
1305
1306 return end;
1307 }
1308 /* fall through */
1309
1310 default:
1312 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1313
1316
1317 return data;
1318 }
1319
1320 return data;
1321}
1322
1323/*
1324 * 12.2.5.65 DOCTYPE system identifier (double-quoted) state
1325 */
1326static const lxb_char_t *
1327lxb_html_tokenizer_state_doctype_system_identifier_double_quoted(lxb_html_tokenizer_t *tkz,
1328 const lxb_char_t *data,
1329 const lxb_char_t *end)
1330{
1331 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1333 }
1334
1336
1337 while (data != end) {
1338 switch (*data) {
1339 /* U+0022 QUOTATION MARK (") */
1340 case 0x22:
1344
1345 tkz->state =
1346 lxb_html_tokenizer_state_doctype_after_system_identifier;
1347
1348 return (data + 1);
1349
1350 /* U+003E GREATER-THAN SIGN (>) */
1351 case 0x3E:
1353
1356
1361
1362 return (data + 1);
1363
1364 /* U+000D CARRIAGE RETURN (CR) */
1365 case 0x0D:
1366 if (++data >= end) {
1368
1370 tkz->state_return = lxb_html_tokenizer_state_doctype_system_identifier_double_quoted;
1371
1372 return data;
1373 }
1374
1376 tkz->pos[-1] = 0x0A;
1377
1379
1380 if (*data != 0x0A) {
1382 data--;
1383 }
1384
1385 break;
1386
1387 /*
1388 * U+0000 NULL
1389 * EOF
1390 */
1391 case 0x00:
1393
1394 if (tkz->is_eof) {
1396
1397 if (tkz->token->attr_last->value_begin == NULL) {
1399 tkz->token->attr_last->value_end);
1400 }
1401
1403 tkz->token->attr_last->value_end,
1405
1407
1410
1411 return end;
1412 }
1413
1416
1419
1420 tkz->token->attr_last->type
1422
1423 break;
1424
1425 /* Anything else */
1426 default:
1427 break;
1428 }
1429
1430 data++;
1431 }
1432
1434
1435 return data;
1436}
1437
1438/*
1439 * 12.2.5.66 DOCTYPE system identifier (single-quoted) state
1440 */
1441static const lxb_char_t *
1442lxb_html_tokenizer_state_doctype_system_identifier_single_quoted(lxb_html_tokenizer_t *tkz,
1443 const lxb_char_t *data,
1444 const lxb_char_t *end)
1445{
1446 if (tkz->token->attr_last->value_begin == NULL && tkz->is_eof == false) {
1448 }
1449
1451
1452 while (data != end) {
1453 switch (*data) {
1454 /* U+0027 APOSTROPHE (') */
1455 case 0x27:
1459
1460 tkz->state =
1461 lxb_html_tokenizer_state_doctype_after_system_identifier;
1462
1463 return (data + 1);
1464
1465 /* U+003E GREATER-THAN SIGN (>) */
1466 case 0x3E:
1468
1471
1476
1477 return (data + 1);
1478
1479 /* U+000D CARRIAGE RETURN (CR) */
1480 case 0x0D:
1481 if (++data >= end) {
1483
1485 tkz->state_return = lxb_html_tokenizer_state_doctype_system_identifier_single_quoted;
1486
1487 return data;
1488 }
1489
1491 tkz->pos[-1] = 0x0A;
1492
1494
1495 if (*data != 0x0A) {
1497 data--;
1498 }
1499
1500 break;
1501
1502 /*
1503 * U+0000 NULL
1504 * EOF
1505 */
1506 case 0x00:
1508
1509 if (tkz->is_eof) {
1511
1512 if (tkz->token->attr_last->value_begin == NULL) {
1514 tkz->token->attr_last->value_end);
1515 }
1516
1518 tkz->token->attr_last->value_end,
1520
1522
1525
1526 return end;
1527 }
1528
1531
1534
1535 tkz->token->attr_last->type
1537
1538 break;
1539
1540 /* Anything else */
1541 default:
1542 break;
1543 }
1544
1545 data++;
1546 }
1547
1549
1550 return data;
1551}
1552
1553/*
1554 * 12.2.5.67 After DOCTYPE system identifier state
1555 */
1556static const lxb_char_t *
1557lxb_html_tokenizer_state_doctype_after_system_identifier(
1559 const lxb_char_t *data,
1560 const lxb_char_t *end)
1561{
1562 switch (*data) {
1563 /*
1564 * U+0009 CHARACTER TABULATION (tab)
1565 * U+000A LINE FEED (LF)
1566 * U+000C FORM FEED (FF)
1567 * U+000D CARRIAGE RETURN (CR)
1568 * U+0020 SPACE
1569 */
1570 case 0x09:
1571 case 0x0A:
1572 case 0x0C:
1573 case 0x0D:
1574 case 0x20:
1575 return (data + 1);
1576
1577 /* U+003E GREATER-THAN SIGN (>) */
1578 case 0x3E:
1580
1582
1583 return (data + 1);
1584
1585 /* EOF */
1586 case 0x00:
1587 if (tkz->is_eof) {
1590
1592
1594
1595 return end;
1596 }
1597 /* fall through */
1598
1599 default:
1602
1603 tkz->state = lxb_html_tokenizer_state_doctype_bogus;
1604
1605 return data;
1606 }
1607
1608 return data;
1609}
1610
1611/*
1612 * 12.2.5.68 Bogus DOCTYPE state
1613 */
1614static const lxb_char_t *
1615lxb_html_tokenizer_state_doctype_bogus(lxb_html_tokenizer_t *tkz,
1616 const lxb_char_t *data,
1617 const lxb_char_t *end)
1618{
1619 while (data != end) {
1620 switch (*data) {
1621 /* U+003E GREATER-THAN SIGN (>) */
1622 case 0x3E:
1624
1626
1627 return (data + 1);
1628
1629 /*
1630 * U+0000 NULL
1631 * EOF
1632 */
1633 case 0x00:
1634 if (tkz->is_eof) {
1636
1637 return end;
1638 }
1639
1642
1643 break;
1644
1645 /* Anything else */
1646 default:
1647 break;
1648 }
1649
1650 data++;
1651 }
1652
1653 return data;
1654}
@ LXB_DOM_ATTR_SYSTEM
Definition attr_const.h:53
@ LXB_DOM_ATTR_PUBLIC
Definition attr_const.h:44
@ LXB_STATUS_ERROR
Definition base.h:50
new_type attr
Definition ffi.c:4364
#define NULL
Definition gdcache.h:45
struct lexbor_hash lexbor_hash_t
Definition hash.h:41
hash(string $algo, string $data, bool $binary=false, array $options=[])
Definition hash.stub.php:12
struct lxb_html_tokenizer lxb_html_tokenizer_t
Definition base.h:26
void lxb_html_token_attr_delete(lxb_html_token_t *token, lxb_html_token_attr_t *attr, lexbor_dobject_t *dobj)
Definition token.c:83
@ LXB_HTML_TOKEN_TYPE_FORCE_QUIRKS
Definition token.h:29
lxb_html_tokenizer_error_t * lxb_html_tokenizer_error_add(lexbor_array_obj_t *parse_errors, const lxb_char_t *pos, lxb_html_tokenizer_error_id_t id)
Definition error.c:11
@ LXB_HTML_TOKENIZER_ERROR_INCHSEAFDONA
Definition error.h:60
@ LXB_HTML_TOKENIZER_ERROR_MIDOPUID
Definition error.h:68
@ LXB_HTML_TOKENIZER_ERROR_MIDOSYID
Definition error.h:70
@ LXB_HTML_TOKENIZER_ERROR_ABDOPUID
Definition error.h:24
@ LXB_HTML_TOKENIZER_ERROR_MIWHBEDONA
Definition error.h:84
@ LXB_HTML_TOKENIZER_ERROR_MIWHAFDOPUKE
Definition error.h:80
@ LXB_HTML_TOKENIZER_ERROR_MIWHAFDOSYKE
Definition error.h:82
@ LXB_HTML_TOKENIZER_ERROR_UNNUCH
Definition error.h:112
@ LXB_HTML_TOKENIZER_ERROR_UNCHAFDOSYID
Definition error.h:104
@ LXB_HTML_TOKENIZER_ERROR_MIDONA
Definition error.h:66
@ LXB_HTML_TOKENIZER_ERROR_MIQUBEDOPUID
Definition error.h:74
@ LXB_HTML_TOKENIZER_ERROR_MIQUBEDOSYID
Definition error.h:76
@ LXB_HTML_TOKENIZER_ERROR_MIWHBEDOPUANSYID
Definition error.h:88
@ LXB_HTML_TOKENIZER_ERROR_ABDOSYID
Definition error.h:26
@ LXB_HTML_TOKENIZER_ERROR_EOINDO
Definition error.h:50
const lxb_char_t * lxb_html_tokenizer_state_data_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
Definition state.c:204
const lxb_char_t * lxb_html_tokenizer_state_cr(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
Definition state.c:1257
#define lxb_html_tokenizer_state_token_attr_set_value_end(tkz, v_end)
Definition state.h:141
#define lxb_html_tokenizer_state_token_attr_add_m(tkz, attr, v_return)
Definition state.h:111
#define lxb_html_tokenizer_state_token_attr_set_name_end_oef(tkz)
Definition state.h:131
#define lxb_html_tokenizer_state_token_attr_set_value_end_oef(tkz)
Definition state.h:144
#define lxb_html_tokenizer_state_append_data_m(tkz, v_data)
Definition state.h:19
#define lxb_html_tokenizer_state_token_attr_set_name_begin(tkz, v_begin)
Definition state.h:121
#define lxb_html_tokenizer_state_append_replace_m(tkz)
Definition state.h:37
#define lxb_html_tokenizer_state_set_name_m(tkz)
Definition state.h:60
#define lxb_html_tokenizer_state_token_done_m(tkz, v_end)
Definition state.h:157
#define lxb_html_tokenizer_state_token_attr_set_name_end(tkz, v_end)
Definition state.h:128
#define lxb_html_tokenizer_state_token_set_end(tkz, v_end)
Definition state.h:98
#define lxb_html_tokenizer_state_token_set_end_oef(tkz)
Definition state.h:108
#define lxb_html_tokenizer_state_token_attr_set_value_begin(tkz, v_begin)
Definition state.h:134
#define lxb_html_tokenizer_state_begin_set(tkz, v_data)
Definition state.h:16
#define lxb_html_tokenizer_state_set_value_m(tkz)
Definition state.h:73
const lxb_dom_attr_data_t * lxb_dom_attr_data_by_id(lexbor_hash_t *hash, lxb_dom_attr_id_t attr_id)
Definition attr.c:411
unsigned const char * end
Definition php_ffi.h:51
unsigned const char * pos
Definition php_ffi.h:52
zend_constant * data
lxb_dom_attr_data_t * lxb_dom_attr_local_name_append(lexbor_hash_t *hash, const lxb_char_t *name, size_t length)
Definition attr.c:358
const lxb_char_t * lxb_html_tokenizer_state_doctype_before(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, const lxb_char_t *end)
bool lexbor_str_data_ncasecmp(const lxb_char_t *first, const lxb_char_t *sec, size_t size)
Definition str.c:435
const lxb_char_t * lexbor_str_data_ncasecmp_first(const lxb_char_t *first, const lxb_char_t *sec, size_t sec_size)
Definition str.c:384
const lxb_char_t * value_begin
Definition token_attr.h:35
const lxb_dom_attr_data_t * name
Definition token_attr.h:38
const lxb_char_t * value_end
Definition token_attr.h:36
lxb_html_token_attr_type_t type
Definition token_attr.h:45
const lxb_char_t * name_end
Definition token_attr.h:33
lxb_html_token_type_t type
Definition token.h:49
lxb_tag_id_t tag_id
Definition token.h:48
lxb_html_token_attr_t * attr_last
Definition token.h:43
lxb_html_token_t * token
Definition tokenizer.h:49
const lxb_char_t * markup
Definition tokenizer.h:67
lxb_char_t * pos
Definition tokenizer.h:72
lxb_html_tokenizer_state_f state_return
Definition tokenizer.h:36
lxb_html_tokenizer_state_f state
Definition tokenizer.h:35
lexbor_hash_t * attrs
Definition tokenizer.h:42
lexbor_array_obj_t * parse_errors
Definition tokenizer.h:56
lxb_status_t status
Definition tokenizer.h:90
lexbor_dobject_t * dobj_token_attr
Definition tokenizer.h:53
const lxb_char_t * last
Definition tokenizer.h:75
@ LXB_TAG__EM_DOCTYPE
Definition const.h:29
@ LXB_HTML_TOKEN_ATTR_TYPE_VALUE_NULL
Definition token_attr.h:28
@ LXB_HTML_TOKEN_ATTR_TYPE_NAME_NULL
Definition token_attr.h:27
struct lxb_html_token_attr lxb_html_token_attr_t
Definition token_attr.h:22
unsigned char lxb_char_t
Definition types.h:27
zend_string * name