php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
mime_sniff.c
Go to the documentation of this file.
1/*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Authors: Niels Dossche <nielsdos@php.net> |
14 +----------------------------------------------------------------------+
15*/
16
/* This file implements the "parse a MIME type" algorithm from https://mimesniff.spec.whatwg.org/#parsing-a-mime-type (Date: 2023-09-27)
 * It is a strict implementation of the algorithm, i.e. it does not accept malformed headers.
 * In particular, it exposes php_libxml_sniff_charset_from_string() and php_libxml_sniff_charset_from_stream()
 * to parse the charset from the Content-Type header.
 */
21
22#ifdef HAVE_CONFIG_H
23#include "config.h"
24#endif
25
26#include "php.h"
27#ifdef HAVE_LIBXML
28
29#include "php_libxml.h"
30
/* Collection predicate: accepts every byte except '/'. */
static bool is_not_slash(char c)
{
	return !(c == '/');
}
35
/* Collection predicate: accepts every byte except ';'. */
static bool is_not_semicolon(char c)
{
	return !(c == ';');
}
40
/* Collection predicate: accepts every byte except ';' and '='. */
static bool is_not_semicolon_or_equals(char c)
{
	return !(c == ';' || c == '=');
}
45
/* Collection predicate: accepts every byte except '"' and '\'. */
static bool is_not_quote_or_backslash(char c)
{
	return !(c == '"' || c == '\\');
}
50
/* https://fetch.spec.whatwg.org/#http-tab-or-space
 * True for U+0009 TAB and U+0020 SPACE only. */
static bool is_http_tab_or_space(char c)
{
	switch (c) {
		case 0x09: /* TAB */
		case 0x20: /* SPACE */
			return true;
		default:
			return false;
	}
}
56
/* https://fetch.spec.whatwg.org/#http-whitespace
 * True for LF, CR, and anything is_http_tab_or_space() accepts. */
static bool is_http_whitespace(char c)
{
	if (is_http_tab_or_space(c)) {
		return true;
	}

	/* U+000A LF or U+000D CR */
	return c == 0x0A || c == 0x0D;
}
62
/* https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point
 * Note: the parameter must stay unsigned so that bytes above 0x7F compare as >= 0x20
 * instead of being treated as negative values. */
static bool is_http_quoted_string_token(unsigned char c)
{
	/* DEL is the single excluded value in the >= 0x20 range */
	if (c == 0x7F) {
		return false;
	}

	/* Everything from SPACE upwards, plus TAB */
	return c >= 0x20 || c == 0x09;
}
68
69/* https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points
70 * Implemented by returning the length of the sequence */
71static zend_always_inline size_t collect_a_sequence_of_code_points(const char *position, const char *end, bool (*condition)(char))
72{
73 const char *start = position;
74 while (position < end && condition(*position)) {
75 position++;
76 }
77 return position - start;
78}
79
80/* https://fetch.spec.whatwg.org/#collect-an-http-quoted-string with extract-value always true */
81static zend_string *collect_an_http_quoted_string_with_extract_value(const char *position, const char *end, const char **position_out)
82{
83 /* 1. Saving positionStart is not necessary, as in the extract-value == true variant we don't use it */
84
85 /* 2. Let value be the empty string */
86 zend_string *value = zend_string_alloc(end - position /* can't be longer than this */, false);
87 ZSTR_LEN(value) = 0;
88
89 /* 3. Assert */
90 ZEND_ASSERT(*position == '"');
91
92 /* 4. Advance */
93 position++;
94
95 /* 5. While true */
96 while (true) {
97 /* 5.1. Append the result of collect a sequence of code points that are not '"' or '\\' */
98 size_t length = collect_a_sequence_of_code_points(position, end, is_not_quote_or_backslash);
99 memcpy(ZSTR_VAL(value) + ZSTR_LEN(value), position, length);
100 ZSTR_LEN(value) += length;
101 position += length;
102
103 /* 5.2. Past end check */
104 if (position >= end) {
105 break;
106 }
107
108 /* 5.3. quoteOrBackslash is the code point at position */
109 char quote_or_backslash = *position;
110
111 /* 5.4. Advance */
112 position++;
113
114 /* 5.5. quote_or_backslash is '\\', deal with escaping */
115 if (quote_or_backslash == '\\') {
116 /* 5.5.1. Past end check */
117 if (position >= end) {
118 ZSTR_VAL(value)[ZSTR_LEN(value)] = '\\';
119 ZSTR_LEN(value)++;
120 break;
121 }
122
123 /* 5.5.2. Append code point at position */
124 ZSTR_VAL(value)[ZSTR_LEN(value)] = *position;
125 ZSTR_LEN(value)++;
126
127 /* 5.5.3. Advance */
128 position++;
129 } else {
130 /* 5.6. Otherwise: assert and break */
131 ZEND_ASSERT(quote_or_backslash == '"');
132 break;
133 }
134 }
135
136 ZSTR_VAL(value)[ZSTR_LEN(value)] = '\0';
137
138 *position_out = position;
139
140 /* 6. extract-value is always true, return value */
141 /* Step 7 is not needed because we always return here already */
142 return value;
143}
144
/* https://infra.spec.whatwg.org/#ascii-alphanumeric
 * True for '0'-'9', 'a'-'z' and 'A'-'Z'. */
static bool is_ascii_alpha_numeric(char c)
{
	if (c >= 'a' && c <= 'z') {
		return true;
	}
	if (c >= 'A' && c <= 'Z') {
		return true;
	}
	return c >= '0' && c <= '9';
}
150
/* https://mimesniff.spec.whatwg.org/#http-token-code-point
 * True for the punctuation listed below and for ASCII alphanumerics. */
static bool is_http_token(char c)
{
	switch (c) {
		case 0x21: /* ! */
		case 0x23: /* # */
		case 0x24: /* $ */
		case 0x25: /* % */
		case 0x26: /* & */
		case 0x27: /* ' */
		case 0x2A: /* * */
		case 0x2B: /* + */
		case 0x2D: /* - */
		case 0x2E: /* . */
		case 0x5E: /* ^ */
		case 0x5F: /* _ */
		case 0x60: /* ` */
		case 0x7C: /* | */
		case 0x7E: /* ~ */
			return true;
		default:
			return is_ascii_alpha_numeric(c);
	}
}
162
/* Returns true when the `len` bytes at `start` are NOT a valid token: either there are
 * no bytes at all, or at least one byte is rejected by is_http_token(). */
static bool is_empty_string_or_does_not_solely_contain_http_token_code_points(const char *start, size_t len)
{
	if (len == 0) {
		return true;
	}

	for (size_t i = 0; i < len; i++) {
		if (!is_http_token(start[i])) {
			return true;
		}
	}

	return false;
}
177
/* Returns true when every one of the `len` bytes at `start` is accepted by
 * is_http_quoted_string_token(). An empty range is accepted. */
static bool solely_contains_http_quoted_string_tokens(const char *start, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		if (!is_http_quoted_string_token(start[i])) {
			return false;
		}
	}

	return true;
}
189
190/* https://mimesniff.spec.whatwg.org/#parsing-a-mime-type
191 * Note: We only care about the charset detection */
192PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_string(const char *start, const char *end)
193{
194 /* 1. Remove leading & trailing HTTP whitespace */
195 while (start < end && is_http_whitespace(*start)) {
196 start++;
197 }
198 while (start < end && is_http_whitespace(*(end - 1))) {
199 end--;
200 }
201
202 /* 2. Position variable: no-op because we move the start pointer instead */
203
204 /* 3. Collect sequence of code points that are not '/' (for type) */
205 size_t type_length = collect_a_sequence_of_code_points(start, end, is_not_slash);
206
207 /* 4. Empty string or not solely http tokens */
208 if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, type_length)) {
209 return NULL;
210 }
211 start += type_length;
212
213 /* 5. Failure if past end of input (note: end is one past the last char; in practice this is only possible if no '/' was found) */
214 if (start >= end) {
215 return NULL;
216 }
217
218 /* 6. Skip '/' */
219 start++;
220
221 /* 7. Collect sequence of code points that are not ';' (for subtype) */
222 size_t subtype_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);
223
224 /* 8. Remove trailing HTTP whitespace from subtype, but we don't care about subtype, so no-op */
225
226 /* 9. Empty string or not solely http tokens */
227 if (is_empty_string_or_does_not_solely_contain_http_token_code_points(start, subtype_length)) {
228 return NULL;
229 }
230 start += subtype_length;
231
232 /* 10. Initialise stuff, no-op as well as we don't care about anything other than charset */
233
234 /* 11. Loop with check: position not past end */
235 while (start < end) {
236 /* 11.1. Advance position */
237 start++;
238
239 /* 11.2. Collect sequence that *is* HTTP whitespace */
240 size_t whitespace_length = collect_a_sequence_of_code_points(start, end, is_http_whitespace);
241 start += whitespace_length;
242
243 /* 11.3. Collect a sequence of code points that are not ';' or '=' (for parameterName) */
244 size_t parameter_name_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon_or_equals);
245 const char *parameter_name = start;
246 start += parameter_name_length;
247
248 /* 11.4. Convert parameter_name to ASCII lowercase, no-op because we are only interested in charset which we'll match down below */
249
250 /* 11.5. Position past input check */
251 if (start < end) {
252 if (*start == ';') {
253 continue;
254 }
255 start++;
256 } else {
257 /* 11.6. */
258 break;
259 }
260
261 /* 11.7. Let parameterValue be null */
262 zend_string *parameter_value = NULL;
263
264 /* 11.8. Quoted string check */
265 if (*start == '"') {
266 /* 11.8.1. Set parameterValue to the result of collecting an HTTP quoted string */
267 parameter_value = collect_an_http_quoted_string_with_extract_value(start, end, &start);
268
269 /* 11.8.2. Collect a sequence of code points that are not ';' */
270 start += collect_a_sequence_of_code_points(start, end, is_not_semicolon);
271 } else {
272 /* 11.9. Otherwise */
273 /* 11.9.1. Set parameterValue to the result of collecting a sequence of code points that are not ';' */
274 size_t parameter_value_length = collect_a_sequence_of_code_points(start, end, is_not_semicolon);
275 parameter_value = zend_string_init(start, parameter_value_length, false);
276 start += parameter_name_length;
277
278 /* 11.9.2. Remove trailing HTTP whitespace from parameterValue */
279 while (ZSTR_LEN(parameter_value) > 0 && is_http_whitespace(ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value) - 1])) {
280 ZSTR_LEN(parameter_value)--;
281 }
282 ZSTR_VAL(parameter_value)[ZSTR_LEN(parameter_value)] = '\0';
283
284 /* 11.9.3. Continue if parameterValue is empty */
285 if (ZSTR_LEN(parameter_value) == 0) {
286 zend_string_release_ex(parameter_value, false);
287 continue;
288 }
289 }
290
291 /* 11.10. We diverge from the spec here: we're only interested in charset.
292 * Furthermore, as only the first match matters, we can stop immediately with the loop once we set the charset. */
293 if (parameter_name_length == strlen("charset")
294 && strncasecmp(parameter_name, "charset", strlen("charset")) == 0 /* Because of lowercasing in step 11.4 */
295 && solely_contains_http_quoted_string_tokens(ZSTR_VAL(parameter_value), ZSTR_LEN(parameter_value))) {
296 return parameter_value;
297 }
298
299 zend_string_release_ex(parameter_value, false);
300 }
301
302 /* 12. Return mimetype, a no-op / spec divergence */
303 return NULL;
304}
305
306PHP_LIBXML_API zend_string *php_libxml_sniff_charset_from_stream(const php_stream *s)
307{
308 if (Z_TYPE(s->wrapperdata) == IS_ARRAY) {
309 zval *header;
310
311 /* Scan backwards: The header array might contain the headers for multiple responses, if
312 * a redirect was followed.
313 */
315 if (Z_TYPE_P(header) == IS_STRING) {
316 /* If no colon is found in the header, we assume it's the HTTP status line and bail out. */
317 char *colon = memchr(Z_STRVAL_P(header), ':', Z_STRLEN_P(header));
318 char *space = memchr(Z_STRVAL_P(header), ' ', Z_STRLEN_P(header));
319 if (colon == NULL || space < colon) {
320 return NULL;
321 }
322
323 if (zend_string_starts_with_literal_ci(Z_STR_P(header), "content-type:")) {
324 return php_libxml_sniff_charset_from_string(Z_STRVAL_P(header) + strlen("content-type:"), Z_STRVAL_P(header) + Z_STRLEN_P(header));
325 }
326 }
328 }
329
330 return NULL;
331}
332
333#endif /* HAVE_LIBXML */
size_t len
Definition apprentice.c:174
header(string $header, bool $replace=true, int $response_code=0)
char s[4]
Definition cdf.c:77
memcpy(ptr1, ptr2, size)
buf start
Definition ffi.c:4687
#define NULL
Definition gdcache.h:45
unsigned const char * end
Definition php_ffi.h:51
struct _php_stream php_stream
Definition php_streams.h:96
struct _zval_struct zval
strlen(string $string)
zend_string_release_ex(func->internal_function.function_name, 0)
#define strncasecmp(s1, s2, n)
#define ZEND_HASH_REVERSE_FOREACH_VAL_IND(ht, _val)
Definition zend_hash.h:1114
#define ZEND_HASH_FOREACH_END()
Definition zend_hash.h:1086
struct _zend_string zend_string
#define zend_always_inline
#define ZEND_ASSERT(c)
#define ZSTR_VAL(zstr)
Definition zend_string.h:68
#define ZSTR_LEN(zstr)
Definition zend_string.h:69
#define zend_string_starts_with_literal_ci(str, prefix)
#define Z_TYPE_P(zval_p)
Definition zend_types.h:660
#define Z_STRVAL_P(zval_p)
Definition zend_types.h:975
#define IS_STRING
Definition zend_types.h:606
#define IS_ARRAY
Definition zend_types.h:607
#define Z_STR_P(zval_p)
Definition zend_types.h:972
#define Z_STRLEN_P(zval_p)
Definition zend_types.h:978
#define Z_TYPE(zval)
Definition zend_types.h:659
#define Z_ARRVAL(zval)
Definition zend_types.h:986
value