38FILE_RCSID(
"@(#)$File: encoding.c,v 1.42 2022/12/26 17:31:14 christos Exp $")
/*
 * DPRINTF(a) expands to `printf a`: the macro argument is pasted directly
 * after the function name, so the caller must supply the entire printf
 * argument list wrapped in its own parentheses, e.g.
 *	DPRINTF(("value=%d\n", value));
 * NOTE(review): presumably this is the debug-enabled variant and a no-op
 * definition exists under another preprocessor branch -- not visible here.
 */
63#define DPRINTF(a) printf a
77 const char **code_mime,
const char **
type)
79 const unsigned char *
buf =
CAST(
const unsigned char *, b->
fbuf);
80 size_t nbytes = b->
flen;
94 *code_mime =
"binary";
99 mlen = (nbytes + 1) *
sizeof((*ubuf)[0]);
108 *code =
"Unicode text, UTF-7";
109 *code_mime =
"utf-7";
113 *code_mime =
"us-ascii";
117 *code =
"Unicode text, UTF-8 (with BOM)";
118 *code_mime =
"utf-8";
121 *code =
"Unicode text, UTF-8";
122 *code_mime =
"utf-8";
123 }
else if ((ucs_type =
looks_ucs32(
buf, nbytes, *ubuf, ulen)) != 0) {
125 *code =
"Unicode text, UTF-32, little-endian";
126 *code_mime =
"utf-32le";
128 *code =
"Unicode text, UTF-32, big-endian";
129 *code_mime =
"utf-32be";
132 }
else if ((ucs_type =
looks_ucs16(
buf, nbytes, *ubuf, ulen)) != 0) {
134 *code =
"Unicode text, UTF-16, little-endian";
135 *code_mime =
"utf-16le";
137 *code =
"Unicode text, UTF-16, big-endian";
138 *code_mime =
"utf-16be";
144 *code_mime =
"iso-8859-1";
147 *code =
"Non-ISO extended-ASCII";
148 *code_mime =
"unknown-8bit";
152 mlen = (nbytes + 1) *
sizeof(nbuf[0]);
162 *code_mime =
"ebcdic";
166 *code =
"International EBCDIC";
167 *code_mime =
"ebcdic";
177 if (ubuf == &udefbuf)
242 F,
F,
F,
F,
F,
F,
F,
T,
T,
T,
T,
T,
T,
T,
F,
F,
244 F,
F,
F,
F,
F,
F,
F,
F,
F,
F,
F,
T,
F,
F,
F,
F,
245 T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
246 T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
247 T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
248 T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
249 T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
250 T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
T,
F,
252 X,
X,
X,
X,
X,
T,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
253 X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
X,
254 I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
255 I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
256 I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
257 I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
258 I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
259 I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I,
I
262#define LOOKS(NAME, COND) \
264looks_ ## NAME(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, \
271 for (i = 0; i < nbytes; i++) { \
272 int t = text_chars[buf[i]]; \
277 ubuf[(*ulen)++] = buf[i]; \
284LOOKS(extended, t !=
T && t !=
I && t !=
X)
314static const uint8_t first[] = {
316 AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
317 AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
318 AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
319 AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
320 AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
321 AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
322 AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
323 AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
AS,
325 XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
326 XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
327 XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
328 XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
329 XX,
XX,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
330 S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
S1,
331 S2,
S3,
S3,
S3,
S3,
S3,
S3,
S3,
S3,
S3,
S3,
S3,
S3,
S4,
S3,
S3,
332 S5,
S6,
S6,
S6,
S7,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
XX,
356 int gotone = 0, ctrl = 0;
361 for (i = 0; i < nbytes; i++) {
362 if ((
buf[i] & 0x80) == 0) {
372 ubuf[(*ulen)++] =
buf[i];
373 }
else if ((
buf[i] & 0x40) == 0) {
377 uint8_t x = first[
buf[i]];
383 if ((
buf[i] & 0x20) == 0) {
386 }
else if ((
buf[i] & 0x10) == 0) {
389 }
else if ((
buf[i] & 0x08) == 0) {
392 }
else if ((
buf[i] & 0x04) == 0) {
395 }
else if ((
buf[i] & 0x02) == 0) {
401 for (
n = 0;
n < following;
n++) {
410 if ((
buf[i] & 0x80) == 0 || (
buf[i] & 0x40))
413 c = (c << 6) + (
buf[i] & 0x3f);
422 return ctrl ? 0 : (gotone ? 2 : 1);
434 if (nbytes > 3 &&
buf[0] == 0xef &&
buf[1] == 0xbb &&
buf[2] == 0xbf)
444 if (nbytes > 4 &&
buf[0] ==
'+' &&
buf[1] ==
'/' &&
buf[2] ==
'v')
/*
 * Range predicates on a numeric code value `c`.  Each one yields non-zero
 * when `c` lies inside the inclusive range shown.  The argument is expanded
 * twice in every macro, so it must be free of side effects.
 */
/* 0xfdd0..0xfdef: a range Unicode designates as noncharacters. */
460#define UCS16_NOCHAR(c) ((c) >= 0xfdd0 && (c) <= 0xfdef)
/* 0xd800..0xdbff: UTF-16 high (leading) surrogate code units. */
461#define UCS16_HISURR(c) ((c) >= 0xd800 && (c) <= 0xdbff)
/* 0xdc00..0xdfff: UTF-16 low (trailing) surrogate code units. */
462#define UCS16_LOSURR(c) ((c) >= 0xdc00 && (c) <= 0xdfff)
475 if (bf[0] == 0xff && bf[1] == 0xfe)
477 else if (bf[0] == 0xfe && bf[1] == 0xff)
485 for (i = 2; i + 1 < nbytes; i += 2) {
509 uc = 0x10000 + 0x400 * (
hi - 1) + (uc - 0xdc00);
516 hi = uc - 0xd800 + 1;
534 if (bf[0] == 0xff && bf[1] == 0xfe && bf[2] == 0 && bf[3] == 0)
536 else if (bf[0] == 0 && bf[1] == 0 && bf[2] == 0xfe && bf[3] == 0xff)
543 for (i = 4; i + 3 < nbytes; i += 4) {
557 if (ubf[*ulen - 1] == 0xfffe)
559 if (ubf[*ulen - 1] < 128 &&
594 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
595 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
596128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
597144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
598' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213,
'.',
'<',
'(',
'+',
'|',
599'&', 169, 170, 171, 172, 173, 174, 175, 176, 177,
'!',
'$',
'*',
')',
';',
'~',
600'-',
'/', 178, 179, 180, 181, 182, 183, 184, 185, 203,
',',
'%',
'_',
'>',
'?',
601186, 187, 188, 189, 190, 191, 192, 193, 194,
'`',
':',
'#',
'@',
'\'',
'=',
'"',
602195,
'a',
'b',
'c',
'd',
'e',
'f',
'g',
'h',
'i', 196, 197, 198, 199, 200, 201,
603202,
'j',
'k',
'l',
'm',
'n',
'o',
'p',
'q',
'r',
'^', 204, 205, 206, 207, 208,
604209, 229,
's',
't',
'u',
'v',
'w',
'x',
'y',
'z', 210, 211, 212,
'[', 214, 215,
605216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228,
']', 230, 231,
606'{',
'A',
'B',
'C',
'D',
'E',
'F',
'G',
'H',
'I', 232, 233, 234, 235, 236, 237,
607'}',
'J',
'K',
'L',
'M',
'N',
'O',
'P',
'Q',
'R', 238, 239, 240, 241, 242, 243,
608'\\',159,
'S',
'T',
'U',
'V',
'W',
'X',
'Y',
'Z', 244, 245, 246, 247, 248, 249,
609'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9', 250, 251, 252, 253, 254, 255
6280x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
6290x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
6300x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
6310x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
6320x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
6330x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
6340x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
6350xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
6360xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
6370xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
6380xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
6390xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
6400x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
6410x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
6420x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
6430x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
655 for (i = 0; i < nbytes; i++) {
zend_ffi_ctype_name_buf buf
file_private int looks_ascii(const unsigned char *, size_t, file_unichar_t *, size_t *)
file_private int looks_utf8_with_BOM(const unsigned char *, size_t, file_unichar_t *, size_t *)
file_private char text_chars[256]
file_protected int file_looks_utf8(const unsigned char *buf, size_t nbytes, file_unichar_t *ubuf, size_t *ulen)
#define LOOKS(NAME, COND)
file_private int looks_ucs16(const unsigned char *, size_t, file_unichar_t *, size_t *)
file_protected int file_encoding(struct magic_set *ms, const struct buffer *b, file_unichar_t **ubuf, size_t *ulen, const char **code, const char **code_mime, const char **type)
file_private int looks_extended(const unsigned char *, size_t, file_unichar_t *, size_t *)
file_private int looks_latin1(const unsigned char *, size_t, file_unichar_t *, size_t *)
file_private unsigned char ebcdic_to_ascii[]
file_private void from_ebcdic(const unsigned char *, size_t, unsigned char *)
file_private int looks_ucs32(const unsigned char *, size_t, file_unichar_t *, size_t *)
struct accept_range accept_ranges[16]
file_private int looks_utf7(const unsigned char *, size_t, file_unichar_t *, size_t *)
unsigned long file_unichar_t
file_protected void file_oomem(struct magic_set *, size_t)
#define ecalloc(nmemb, size)