46 echo
"Usage: php ucgendata.php ./datadir\n";
47 echo
"./datadir must contain:\n";
48 echo
"UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
62 echo
"File $file does not exist.\n";
76$eawFile = __DIR__ .
"/../libmbfl/mbfl/eaw_table.h";
103 "Mn",
"Mc",
"Me",
"Nd",
"Nl",
"No",
104 "Zs",
"Zl",
"Zp",
"Cs",
"Co",
"Cn",
105 "Lu",
"Ll",
"Lt",
"Lm",
"Lo",
"Sm",
106 "Sc",
"Sk",
"So",
"L",
"R",
"EN",
107 "ES",
"ET",
"AN",
"CS",
"B",
"S",
109 "C",
"P",
"Cased",
"Case_Ignorable"
111 $this->numProps =
count($this->propIndexes);
113 $this->propRanges =
array_fill(0, $this->numProps, []);
120 $this->extraCaseData = [];
125 if (
in_array($prop, [
"BN",
"NSM",
"PDF",
"LRE",
"LRO",
"RLE",
"RLO",
"LRI",
"RLI",
"FSI",
"PDI"])) {
135 if (
in_array($prop, [
"Pc",
"Pd",
"Ps",
"Pe",
"Po",
"Pi",
"Pf"])) {
139 if (
in_array($prop, [
"Cc",
"Cf"])) {
143 if (!isset($this->propIndexes[$prop])) {
144 throw new Exception(
"Unknown property $prop");
147 return $this->propIndexes[$prop];
150 public function addProp(
int $code,
string $prop) {
151 $propIdx = self::propToIndex($prop);
154 $ranges = $this->propRanges[$propIdx];
155 if (!empty($ranges)) {
156 $lastRange = $ranges[
count($ranges) - 1];
157 if ($code === $lastRange->end + 1) {
163 $this->propRanges[$propIdx][] =
new Range($code, $code);
166 public function addPropRange(
int $startCode,
int $endCode,
string $prop) {
167 $propIdx = self::propToIndex($prop);
168 $this->propRanges[$propIdx][] =
new Range($startCode, $endCode);
172 $this->caseMaps[$case][$origCode] = $mappedCode;
178 return $r1->start <=> $r2->start;
181 $lastRange =
new Range(-1, -1);
183 foreach ($ranges as $range) {
184 if ($lastRange->end == -1) {
186 }
else if ($range->start == $lastRange->end + 1) {
187 $lastRange->end = $range->end;
188 }
else if ($range->start > $lastRange->end + 1) {
189 $newRanges[] = $lastRange;
193 "Overlapping ranges [%x, %x] and [%x, %x]",
194 $lastRange->start, $lastRange->end,
195 $range->start, $range->end
199 if ($lastRange->end != -1) {
200 $newRanges[] = $lastRange;
206 foreach ($this->propRanges as &$ranges) {
214 foreach ($lines as $line) {
216 if (
false !== $hashPos =
strpos($line,
'#')) {
217 $line =
substr($line, 0, $hashPos);
233 foreach ($lines as $fields) {
234 if (
count($fields) != 15) {
235 throw new Exception(
"Line does not contain 15 fields");
238 $code =
intval($fields[0], 16);
245 if (
$name[0] ===
'<' &&
$name !==
'<control>') {
248 $nextFields = $lines->current();
249 $nextCode =
intval($nextFields[0], 16);
251 $generalCategory = $fields[2];
252 $data->addPropRange($code, $nextCode, $generalCategory);
254 $bidiClass = $fields[4];
255 $data->addPropRange($code, $nextCode, $bidiClass);
259 $generalCategory = $fields[2];
260 $data->addProp($code, $generalCategory);
262 $bidiClass = $fields[4];
263 $data->addProp($code, $bidiClass);
265 $upperCase =
intval($fields[12], 16);
266 $lowerCase =
intval($fields[13], 16);
267 $titleCase =
intval($fields[14], 16) ?: $upperCase;
269 $data->addCaseMapping(
'upper', $code, $upperCase);
272 $data->addCaseMapping(
'lower', $code, $lowerCase);
275 $data->addCaseMapping(
'title', $code, $titleCase);
282 foreach (
explode(
' ', $strCodes) as $strCode) {
283 $codes[] =
intval($strCode, 16);
290 if (
count($fields) != 4) {
291 throw new Exception(
"Line does not contain 4 fields");
294 $code =
intval($fields[0], 16);
295 $status = $fields[1];
296 if ($status ==
'T') {
301 if ($status ==
'C' || $status ==
'S') {
302 $foldCode =
intval($fields[2], 16);
303 if (!isset(
$data->caseMaps[
'fold'][$code])) {
304 $data->addCaseMapping(
'fold', $code, $foldCode);
308 $data->caseMaps[
'fold'][$code][0] = $foldCode;
310 }
else if ($status ==
'F') {
312 $existingFoldCode =
$data->caseMaps[
'fold'][$code] ?? $code;
321 $simpleCaseCode =
$data->caseMaps[$type][$code] ?? $code;
322 if (
count($caseCodes) == 1) {
323 if ($caseCodes[0] != $simpleCaseCode) {
324 throw new Exception(
"Simple case code in special casing does not match");
329 if ($type ==
'title' && $code == $caseCodes[0]
330 && (
$data->caseMaps[
'upper'][$code] ?? $code) != $code) {
331 $data->caseMaps[
'title'][$code] = $code;
336 if (
count($caseCodes) > 3) {
337 throw new Exception(
"Special case mapping with more than 3 code points");
345 if (
count($fields) != 5 &&
count($fields) != 6) {
346 throw new Exception(
"Line does not contain 5 or 6 fields");
349 $code =
intval($fields[0], 16);
370 $fieldCount =
count($fields);
371 if ($fieldCount != 2 && $fieldCount !== 3) {
372 throw new Exception(
"Line does not contain 2 or 3 fields");
375 $usedProperties = [
'Cased',
'Case_Ignorable'];
376 if (isset($fields[2]) &&
in_array($fields[2], $usedProperties,
true)) {
377 $property = $fields[2];
383 $property = $fields[1];
387 $range =
explode(
'..', $fields[0]);
388 if (
count($range) == 2) {
390 }
else if (
count($range) == 1) {
402 if ($fields[1] ==
'W' || $fields[1] ==
'F') {
403 if ($dotsPos =
strpos($fields[0],
'..')) {
404 $startCode =
intval(
substr($fields[0], 0, $dotsPos), 16);
405 $endCode =
intval(
substr($fields[0], $dotsPos + 2), 16);
407 if (!empty($wideRanges)) {
408 $lastRange = $wideRanges[
count($wideRanges) - 1];
409 if ($startCode == $lastRange->end + 1) {
410 $lastRange->end = $endCode;
415 $wideRanges[] =
new Range($startCode, $endCode);
417 $code =
intval($fields[0], 16);
419 if (!empty($wideRanges)) {
420 $lastRange = $wideRanges[
count($wideRanges) - 1];
421 if ($code == $lastRange->end + 1) {
427 $wideRanges[] =
new Range($code, $code);
435function formatArray(array $values,
int $width,
string $format) : string {
439 for ($i = 0; $i < $c; $i++) {
444 $result .= $i % $width == 0 ?
"\n\t" :
" ";
461 $data->compactPropRanges();
465 foreach (
$data->propRanges as $ranges) {
466 $num =
count($ranges);
467 $propOffsets[] = $idx;
472 $propOffsets[] = $idx;
477 while (
count($propOffsets) % 4 != 0) {
484 $result .=
"static const unsigned short _ucprop_size = $data->numProps;\n\n";
485 $result .=
"static const unsigned short _ucprop_offsets[] = {";
490 foreach (
$data->propRanges as $ranges) {
491 foreach ($ranges as $range) {
492 $values[] = $range->start;
493 $values[] = $range->end;
497 $result .=
"static const unsigned int _ucprop_ranges[] = {";
505 foreach (
$array as $arr) {
506 foreach ($arr as $v) {
515 foreach (
$data->caseMaps[
'title'] as $code => $titleCode) {
516 if ($titleCode == (
$data->caseMaps[
'upper'][$code] ?? $code)) {
517 unset(
$data->caseMaps[
'title'][$code]);
523 foreach (
$data->caseMaps as $type => $caseMap) {
524 foreach ($caseMap as $code => $caseCode) {
527 $len =
count($caseCode) - 1;
529 $data->caseMaps[$type][$code] = ($len << 24) | $idx;
531 foreach ($caseCode as $c) {
532 $data->extraCaseData[] = $c;
542 echo
"$name: n=",
count($table),
", g=",
count($gTable),
"\n";
545 $result .=
"static const unsigned {$prefix}_g_size = " .
count($gTable) .
";\n";
546 $result .=
"static const short {$prefix}_g[] = {";
549 $result .=
"static const unsigned {$prefix}_table_size = " .
count($table) .
";\n";
550 $result .=
"static const unsigned {$prefix}_table[] = {";
564 $result .=
"static const unsigned _uccase_extra_table[] = {";
602 $x = ((
$x >> 16) ^
$x) * 0x45d9f3b;
603 return $x & 0xffffffff;
612 foreach (
$map as $k => $v) {
614 $buckets[$h][] = [$k, $v];
618 usort($buckets,
function ($b1, $b2) {
622 foreach ($buckets as $bucket) {
623 $collisions =
count($bucket);
624 if ($collisions <= 1) {
632 while ($i < $collisions) {
637 list($k) = $bucket[$i];
638 $slot =
hashInt($d, $k) % $tableSize;
639 if (isset($table[$slot]) || isset($used[$slot])) {
649 $g =
hashInt(0, $bucket[0][0]) % $gSize;
651 foreach ($bucket as $elem) {
652 $table[
hashInt($d, $elem[0]) % $tableSize] = $elem;
657 for ($i = 0; $i < $tableSize; $i++) {
658 if (!isset($table[$i])) {
665 foreach ($buckets as $bucket) {
666 if (
count($bucket) != 1) {
671 $slot = $freeSlots[$freeIdx++];
672 $table[$slot] = $elem;
674 $g =
hashInt(0, $elem[0]) % $gSize;
675 $gTable[$g] = -$slot;
681 return [$gTable, $table];
687 for ($lambda = 5.0;; $lambda -= 0.5) {
690 if (!empty($tmpMph)) {
700 if (!empty($tmpMph)) {
730 $result .=
"\n#define FIRST_DOUBLEWIDTH_CODEPOINT 0x" .
dechex($wideRanges[0]->
start) .
"\n\n";
736} mbfl_eaw_table[] = {
740 foreach ($wideRanges as $range) {
741 $startCode =
dechex($range->start);
742 $endCode =
dechex($range->end);
743 $result .=
"\t{ 0x{$startCode}, 0x{$endCode} },\n";
array_fill(int $start_index, int $count, mixed $value)
trim(string $string, string $characters=" \n\r\t\v\0")
in_array(mixed $needle, array $haystack, bool $strict=false)
file_exists(string $filename)
file_get_contents(string $filename, bool $use_include_path=false, $context=null, int $offset=0, ?int $length=null)
array_merge(array ... $arrays)
explode(string $separator, string $string, int $limit=PHP_INT_MAX)
intval(mixed $value, int $base=10)
strpos(string $haystack, string $needle, int $offset=0)
file_put_contents(string $filename, mixed $data, int $flags=0, $context=null)
array_map(?callable $callback, array $array, array ... $arrays)
usort(array &$array, callable $callback)
count(Countable|array $value, int $mode=COUNT_NORMAL)
ksort(array &$array, int $flags=SORT_REGULAR)
assert(mixed $assertion, Throwable|string|null $description=null)
substr(string $string, int $offset, ?int $length=null)
__construct(int $start, int $end)
addCaseMapping(string $case, int $origCode, int $mappedCode)
propToIndex(string $prop)
compactRangeArray(array $ranges)
addPropRange(int $startCode, int $endCode, string $prop)
addProp(int $code, string $prop)
foreach( $chunks as $chunk)
for($i=0; $i< 0x100;++$i) $map[chr(0)]
foreach(explode("\n", $input) as $line) $result
sprintf("0x%X", $numelems)
if(! $pass2) elseif( $pass2==1) elseif( $pass2==2)
foreach($dp as $el) foreach( $dp as $el) if( $pass2< 2) echo ""
unsigned const char * end
generateData(UnicodeData $data)
prepareCaseData(UnicodeData $data)
parseUnicodeData(UnicodeData $data, string $input)
formatIntArray(array $values, int $width)
formatShortHexArray(array $values, int $width)
generatePropData(UnicodeData $data)
$derivedCorePropertiesFile
parseSpecialCasing(UnicodeData $data, string $input)
generateMPH(array $map, bool $fast)
formatArray(array $values, int $width, string $format)
parseCaseFolding(UnicodeData $data, string $input)
addSpecialCasing(UnicodeData $data, string $type, int $code, array $caseCodes)
parseCodes(string $strCodes)
parseEastAsianWidth(string $input)
formatShortDecArray(array $values, int $width)
generateEastAsianWidthData(array $wideRanges)
foreach($files as $file) $outputFile
tryGenerateMPH(array $map, int $gSize)
parseDerivedCoreProperties(UnicodeData $data, string $input)
generateCaseData(UnicodeData $data)
generateCaseMPH(string $name, array $map)
parseDataFile(string $input)
error_reporting(?int $error_level=null)
ZEND_API void(ZEND_FASTCALL *zend_touch_vm_stack_data)(void *vm_stack_data)
function(EX_VAR(opline->result.var))