419 lines
9.0 KiB
Bash
419 lines
9.0 KiB
Bash
|
#!/bin/sh
#
# Generate archive_string_composition.h from the Unicode Character Database.
#
# This needs http://unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Usage: pass the path of UnicodeData.txt as the first argument.  The header
# is written to the current directory.
#
inputfile="$1"	# Expect UnicodeData.txt
outfile=archive_string_composition.h
# The awk program is written to a scratch file.  Let mktemp(1) create it
# atomically with an unpredictable name; a fixed /tmp name built from the
# process id could be pre-created or symlinked by another local user.
pickout=$(mktemp /tmp/mk_unicode_composition_tbl.XXXXXX) || exit 1
# Remove the scratch file however the script ends, including on a signal.
trap 'rm -f "${pickout}"' EXIT
trap 'exit 1' HUP INT TERM
#################################################################################
#
# Write the file header of "archive_string_composition.h"
#
#################################################################################
# append_copyright: create (or truncate) ${outfile} and write the license
# comment, the "generated file" notice, the include guard opening and the
# definition of struct unicode_composition_table into it.
#
# The here-document delimiter is unquoted, so the shell still performs
# expansions inside the text.  Dollar signs and backquotes that must reach
# the output literally are therefore backslash-escaped; an unescaped pair of
# backquotes would be run as an empty command substitution and vanish.
append_copyright()
{
	cat > "${outfile}" <<CR_END
/*-
* Copyright (c) 2011 libarchive Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) \`\`AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* \$FreeBSD\$
*
*/

/*
* ATTENTION!
* This file is generated by build/utils/gen_archive_string_composition_h.sh
* from http://unicode.org/Public/UNIDATA/UnicodeData.txt
*
* See also http://unicode.org/report/tr15/
*/

#ifndef __LIBARCHIVE_BUILD
#error This header is only to be used internally to libarchive.
#endif

#ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED
#define ARCHIVE_STRING_COMPOSITION_H_INCLUDED

struct unicode_composition_table {
uint32_t cp1;
uint32_t cp2;
uint32_t nfc;
};

CR_END
}
#################################################################################
#
# awk script
#
# Everything up to the AWK_END marker is written to the scratch file.  The
# here-document is unquoted, so every dollar sign and backslash meant for awk
# is backslash-escaped below.
#
#################################################################################
cat <<AWK_END > ${pickout}
#
BEGIN {
	# First and last code point seen with a non-zero combining class.
	max = "";
	min = "";
	# UnicodeData.txt separates its fields with semicolons.
	FS = ";"
	# Composition triples are piped to sort(1); a second awk then formats
	# each sorted triple as one C initializer row.
	cmd = "sort | awk -F ' ' '{printf \"\\\\t{ 0x%s , 0x%s , 0x%s },\\\\n\",\$1,\$2,\$3}'"
	print "static const struct unicode_composition_table u_composition_table[] = {"
}
END {
|
||
|
close(cmd)
|
||
|
print "};"
|
||
|
print ""
|
||
|
#
|
||
|
# Output Canonical Combining Class tables used for translating NFD to NFC.
|
||
|
#
|
||
|
printf "#define CANONICAL_CLASS_MIN\\t0x%s\\n", min
|
||
|
printf "#define CANONICAL_CLASS_MAX\\t0x%s\\n", max
|
||
|
print ""
|
||
|
printf "#define IS_DECOMPOSABLE_BLOCK(uc)\\t\\\\\n"
|
||
|
printf "\\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\\n", highnum
|
||
|
printf "static const char u_decomposable_blocks[0x%X+1] = {\\n\\t", highnum
|
||
|
#
|
||
|
# Output blockmap
|
||
|
for (i = 0; i <= highnum; i++) {
|
||
|
if (i != 0 && i % 32 == 0)
|
||
|
printf "\\n\\t"
|
||
|
# Additionally Hangul[11XX(17), AC00(172) - D7FF(215)] is decomposable.
|
||
|
if (blockmap[i] || i == 17 || (i >= 172 && i <= 215))
|
||
|
printf "1,"
|
||
|
else
|
||
|
printf "0,"
|
||
|
}
|
||
|
printf "\\n};\\n\\n"
|
||
|
#
|
||
|
# Output a macro to get a canonical combining class.
|
||
|
#
|
||
|
print "/* Get Canonical Combining Class(CCC). */"
|
||
|
printf "#define CCC(uc)\\t\\\\\n"
|
||
|
printf "\\t(((uc) > 0x%s)?0:\\\\\\n", max
|
||
|
printf "\\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\\n"
|
||
|
print ""
|
||
|
#
|
||
|
# Output a canonical combining class value table.
|
||
|
#
|
||
|
midcnt = 0
|
||
|
printf "/* The table of the value of Canonical Cimbining Class */\\n"
|
||
|
print "static const unsigned char ccc_val[][16] = {"
|
||
|
print " /* idx=0: XXXX0 - XXXXF */"
|
||
|
print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
|
||
|
for (h = 0; h <= highnum; h++) {
|
||
|
if (!blockmap[h])
|
||
|
continue;
|
||
|
for (m = 0; m < 16; m++) {
|
||
|
if (!xx_blockmap[h, m])
|
||
|
continue;
|
||
|
midcnt++
|
||
|
printf " /* idx=%d: %03X%1X0 - %03X%1XF */\\n {", midcnt, h, m, h, m
|
||
|
for (l = 0; l < 15; l++) {
|
||
|
printf "%d, ", xxx_blockmap[h, m, l]
|
||
|
}
|
||
|
printf "%d },\n", xxx_blockmap[h, m, 15]
|
||
|
}
|
||
|
}
|
||
|
printf "};\n"
|
||
|
#
|
||
|
# Output the index table of the canonical combining class value table.
|
||
|
#
|
||
|
cnt = 0
|
||
|
midcnt = 0
|
||
|
printf "\\n/* The index table to ccc_val[*][16] */\\n"
|
||
|
print "static const unsigned char ccc_val_index[][16] = {"
|
||
|
print " /* idx=0: XXX00 - XXXFF */"
|
||
|
print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
|
||
|
for (h = 0; h <= highnum; h++) {
|
||
|
if (!blockmap[h])
|
||
|
continue;
|
||
|
cnt++
|
||
|
printf " /* idx=%d: %03X00 - %03XFF */\\n {", cnt, h, h
|
||
|
for (m = 0; m < 16; m++) {
|
||
|
if (m != 0)
|
||
|
printf ","
|
||
|
if (xx_blockmap[h, m]) {
|
||
|
midcnt++
|
||
|
printf "%2d", midcnt
|
||
|
} else
|
||
|
printf " 0"
|
||
|
}
|
||
|
printf " },\\n"
|
||
|
}
|
||
|
printf "};\\n"
|
||
|
#
|
||
|
# Output the index table to the index table of the canonical combining
|
||
|
# class value table.
|
||
|
#
|
||
|
printf "\\n/* The index table to ccc_val_index[*][16] */\\n"
|
||
|
printf "static const unsigned char ccc_index[] = {\\n ", h
|
||
|
cnt = 0
|
||
|
for (h = 0; h <= highnum; h++) {
|
||
|
if (h != 0 && h % 24 == 0)
|
||
|
printf "\\n "
|
||
|
if (blockmap[h]) {
|
||
|
cnt++;
|
||
|
printf "%2d,", cnt
|
||
|
} else
|
||
|
printf " 0,"
|
||
|
}
|
||
|
print "};"
|
||
|
print ""
|
||
|
print "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */"
|
||
|
}
|
||
|
#
# hextoi(hex): return the numeric value of a string of hexadecimal digits.
#
# Characters that are not hexadecimal digits are skipped, and an empty
# string yields 0.  The names after the wide gap in the parameter list are
# awk local variables, not arguments: without them the loop index would be
# a global and could clobber a caller that uses the same name.
#
function hextoi(hex,    value, pos, len, digit)
{
	value = 0
	len = length(hex)
	for (pos = 1; pos <= len; pos++) {
		# index() gives the 1-based position in the digit string,
		# or 0 when the character is not a hexadecimal digit.
		digit = index("0123456789ABCDEF", toupper(substr(hex, pos, 1)))
		if (digit > 0)
			value = value * 16 + digit - 1
	}
	return value
}
#
|
||
|
# Collect Canonical Combining Class values.
|
||
|
#
|
||
|
\$4 ~/^[0-9A-F]+$/ {
|
||
|
if (\$4 !~/^0$/) {
|
||
|
if (min == "") {
|
||
|
min = \$1
|
||
|
}
|
||
|
max = \$1
|
||
|
high = substr(\$1, 1, length(\$1) -2)
|
||
|
highnum = hextoi(high)
|
||
|
mid = substr(\$1, length(\$1) -1, 1)
|
||
|
midnum = hextoi(mid)
|
||
|
low = substr(\$1, length(\$1), 1)
|
||
|
lownum = hextoi(low)
|
||
|
blockmap[highnum] = 1
|
||
|
xx_blockmap[highnum, midnum] = 1
|
||
|
xxx_blockmap[highnum, midnum, lownum] = \$4
|
||
|
}
|
||
|
}
|
||
|
#
# Following code points are not decomposed in MAC OS.
# U+2000 - U+2FFF
# U+F900 - U+FAFF
# U+2F800 - U+2FAFF
# The three rules for them are kept here, disabled.
#
#\$1 ~ /^2[0-9A-F][0-9A-F][0-9A-F]\$/ { next }
#\$1 ~ /^F[9A][0-9A-F][0-9A-F]\$/ { next }
#\$1 ~ /^2F[89A][0-9A-F][0-9A-F]\$/ { next }
#
# Exclusion code points specified by
# http://unicode.org/Public/UNIDATA/CompositionExclusions.txt
#
# A record whose code point matches any rule below is skipped before it
# can reach the composition output rule that follows this list.
#
##
# 1. Script Specifics
##
\$1 ~ /^095[89ABCDEF]\$/ { next }
\$1 ~ /^09D[CDF]\$/ { next }
\$1 ~ /^0A3[36]\$/ { next }
\$1 ~ /^0A5[9ABE]\$/ { next }
\$1 ~ /^0B5[CD]\$/ { next }
\$1 ~ /^0F4[3D]\$/ { next }
\$1 ~ /^0F5[27C]\$/ { next }
\$1 ~ /^0F69\$/ { next }
\$1 ~ /^0F7[68]\$/ { next }
\$1 ~ /^0F9[3D]\$/ { next }
\$1 ~ /^0FA[27C]\$/ { next }
\$1 ~ /^0FB9\$/ { next }
\$1 ~ /^FB1[DF]\$/ { next }
\$1 ~ /^FB2[ABCDEF]\$/ { next }
\$1 ~ /^FB3[012345689ABCE]\$/ { next }
\$1 ~ /^FB4[01346789ABCDE]\$/ { next }
##
# 2. Post Composition Version precomposed characters
##
\$1 ~ /^2ADC\$/ { next }
\$1 ~ /^1D15[EF]\$/ { next }
\$1 ~ /^1D16[01234]\$/ { next }
\$1 ~ /^1D1B[BCDEF]\$/ { next }
\$1 ~ /^1D1C0\$/ { next }
##
# 3. Singleton Decompositions
##
\$1 ~ /^034[01]\$/ { next }
\$1 ~ /^037[4E]\$/ { next }
\$1 ~ /^0387\$/ { next }
\$1 ~ /^1F7[13579BD]\$/ { next }
\$1 ~ /^1FB[BE]\$/ { next }
\$1 ~ /^1FC[9B]\$/ { next }
\$1 ~ /^1FD[3B]\$/ { next }
\$1 ~ /^1FE[3BEF]\$/ { next }
\$1 ~ /^1FF[9BD]\$/ { next }
\$1 ~ /^200[01]\$/ { next }
\$1 ~ /^212[6AB]\$/ { next }
\$1 ~ /^232[9A]\$/ { next }
\$1 ~ /^F9[0-9A-F][0-9A-F]\$/ { next }
\$1 ~ /^FA0[0-9A-D]\$/ { next }
\$1 ~ /^FA1[025-9A-E]\$/ { next }
\$1 ~ /^FA2[0256A-D]\$/ { next }
\$1 ~ /^FA[3-5][0-9A-F]\$/ { next }
\$1 ~ /^FA6[0-9A-D]\$/ { next }
\$1 ~ /^FA[7-9A-C][0-9A-F]\$/ { next }
\$1 ~ /^FAD[0-9]\$/ { next }
\$1 ~ /^2F[89][0-9A-F][0-9A-F]\$/ { next }
\$1 ~ /^2FA0[0-9A-F]\$/ { next }
\$1 ~ /^2FA1[0-9A-D]\$/ { next }
##
# 4. Non-Starter Decompositions
##
\$1 ~ /^0344\$/ { next }
\$1 ~ /^0F7[35]\$/ { next }
\$1 ~ /^0F81\$/ { next }
#
|
||
|
# Output combinations for NFD ==> NFC.
|
||
|
#
|
||
|
\$6 ~/^[0-9A-F]+ [0-9A-F]+\$/ {
|
||
|
split(\$6, cp, " ")
|
||
|
if (length(\$1) == 4)
|
||
|
print "0"cp[1], "0"cp[2], "0"\$1 | cmd
|
||
|
else
|
||
|
print cp[1], cp[2], \$1 | cmd
|
||
|
}
|
||
|
AWK_END
|
||
|
#################################################################################
#
# Run the awk script.
#
#################################################################################
append_copyright
# Quote the paths so that names containing whitespace survive word splitting.
# The ${inputfile:+...} form passes no operand at all when no argument was
# given, which keeps the previous fall-back of reading the data from stdin.
awk -f "${pickout}" ${inputfile:+"${inputfile}"} >> "${outfile}"
status=$?
#
# Remove the awk script.
#
rm -f "${pickout}"
# Report awk's result rather than the result of the cleanup.
exit ${status}