/* See LICENSE file for copyright and license details. */
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "util.h"
|
|
|
|
/* Paths of the data files referenced by the .file fields of the property table */
#define FILE_EAW   "data/EastAsianWidth.txt"
#define FILE_EMOJI "data/emoji-data.txt"
#define FILE_LINE  "data/LineBreak.txt"
|
|
|
|
static const struct property_spec line_break_property[] = {
|
|
{
|
|
.enumname = "AL",
|
|
.file = FILE_LINE,
|
|
.ucdname = "AL",
|
|
},
|
|
/*
|
|
* Both extended pictographic and cn are large classes,
|
|
* but we are only interested in their intersection for LB30b,
|
|
* so we have the following two temporary classes. At first
|
|
* the extpict-class is filled, then the cn-class, which leads
|
|
* to conflicts (that we handle by putting them in the "proper"
|
|
* class BOTH_CN_EXTPICT). We make use of the fact that there
|
|
* is no intersection between AL and Cn.
|
|
*
|
|
* Any consecutive conflicts are permitted to overwrite
|
|
* TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need
|
|
* them, and in the final postprocessing we "reset" all
|
|
* remaining matches (that then didn't fit any of the other
|
|
* classes) to the generic class AL.
|
|
*/
|
|
{
|
|
.enumname = "TMP_CN",
|
|
.file = FILE_LINE,
|
|
.ucdname = "Cn",
|
|
},
|
|
{
|
|
.enumname = "TMP_EXTENDED_PICTOGRAPHIC",
|
|
.file = FILE_EMOJI,
|
|
.ucdname = "Extended_Pictographic",
|
|
},
|
|
/* end of special block */
|
|
{
|
|
.enumname = "B2",
|
|
.file = FILE_LINE,
|
|
.ucdname = "B2",
|
|
},
|
|
{
|
|
.enumname = "BA",
|
|
.file = FILE_LINE,
|
|
.ucdname = "BA",
|
|
},
|
|
{
|
|
.enumname = "BB",
|
|
.file = FILE_LINE,
|
|
.ucdname = "BB",
|
|
},
|
|
{
|
|
.enumname = "BK",
|
|
.file = FILE_LINE,
|
|
.ucdname = "BK",
|
|
},
|
|
{
|
|
.enumname = "BOTH_CN_EXTPICT",
|
|
.file = NULL,
|
|
.ucdname = NULL,
|
|
},
|
|
{
|
|
.enumname = "CB",
|
|
.file = FILE_LINE,
|
|
.ucdname = "CB",
|
|
},
|
|
{
|
|
.enumname = "CL",
|
|
.file = FILE_LINE,
|
|
.ucdname = "CL",
|
|
},
|
|
{
|
|
.enumname = "CM",
|
|
.file = FILE_LINE,
|
|
.ucdname = "CM",
|
|
},
|
|
{
|
|
.enumname = "CP_WITHOUT_EAW_HWF",
|
|
.file = FILE_LINE,
|
|
.ucdname = "CP",
|
|
},
|
|
{
|
|
.enumname = "CP_WITH_EAW_HWF",
|
|
.file = NULL,
|
|
.ucdname = NULL,
|
|
},
|
|
{
|
|
.enumname = "CR",
|
|
.file = FILE_LINE,
|
|
.ucdname = "CR",
|
|
},
|
|
{
|
|
.enumname = "EB",
|
|
.file = FILE_LINE,
|
|
.ucdname = "EB",
|
|
},
|
|
{
|
|
.enumname = "EM",
|
|
.file = FILE_LINE,
|
|
.ucdname = "EM",
|
|
},
|
|
{
|
|
.enumname = "EX",
|
|
.file = FILE_LINE,
|
|
.ucdname = "EX",
|
|
},
|
|
{
|
|
.enumname = "GL",
|
|
.file = FILE_LINE,
|
|
.ucdname = "GL",
|
|
},
|
|
{
|
|
.enumname = "H2",
|
|
.file = FILE_LINE,
|
|
.ucdname = "H2",
|
|
},
|
|
{
|
|
.enumname = "H3",
|
|
.file = FILE_LINE,
|
|
.ucdname = "H3",
|
|
},
|
|
{
|
|
.enumname = "HL",
|
|
.file = FILE_LINE,
|
|
.ucdname = "HL",
|
|
},
|
|
{
|
|
.enumname = "HY",
|
|
.file = FILE_LINE,
|
|
.ucdname = "HY",
|
|
},
|
|
{
|
|
.enumname = "ID",
|
|
.file = FILE_LINE,
|
|
.ucdname = "ID",
|
|
},
|
|
{
|
|
.enumname = "IN",
|
|
.file = FILE_LINE,
|
|
.ucdname = "IN",
|
|
},
|
|
{
|
|
.enumname = "IS",
|
|
.file = FILE_LINE,
|
|
.ucdname = "IS",
|
|
},
|
|
{
|
|
.enumname = "JL",
|
|
.file = FILE_LINE,
|
|
.ucdname = "JL",
|
|
},
|
|
{
|
|
.enumname = "JT",
|
|
.file = FILE_LINE,
|
|
.ucdname = "JT",
|
|
},
|
|
{
|
|
.enumname = "JV",
|
|
.file = FILE_LINE,
|
|
.ucdname = "JV",
|
|
},
|
|
{
|
|
.enumname = "LF",
|
|
.file = FILE_LINE,
|
|
.ucdname = "LF",
|
|
},
|
|
{
|
|
.enumname = "NL",
|
|
.file = FILE_LINE,
|
|
.ucdname = "NL",
|
|
},
|
|
{
|
|
.enumname = "NS",
|
|
.file = FILE_LINE,
|
|
.ucdname = "NS",
|
|
},
|
|
{
|
|
.enumname = "NU",
|
|
.file = FILE_LINE,
|
|
.ucdname = "NU",
|
|
},
|
|
{
|
|
.enumname = "OP_WITHOUT_EAW_HWF",
|
|
.file = FILE_LINE,
|
|
.ucdname = "OP",
|
|
},
|
|
{
|
|
.enumname = "OP_WITH_EAW_HWF",
|
|
.file = NULL,
|
|
.ucdname = NULL,
|
|
},
|
|
{
|
|
.enumname = "PO",
|
|
.file = FILE_LINE,
|
|
.ucdname = "PO",
|
|
},
|
|
{
|
|
.enumname = "PR",
|
|
.file = FILE_LINE,
|
|
.ucdname = "PR",
|
|
},
|
|
{
|
|
.enumname = "QU",
|
|
.file = FILE_LINE,
|
|
.ucdname = "QU",
|
|
},
|
|
{
|
|
.enumname = "RI",
|
|
.file = FILE_LINE,
|
|
.ucdname = "RI",
|
|
},
|
|
{
|
|
.enumname = "SP",
|
|
.file = FILE_LINE,
|
|
.ucdname = "SP",
|
|
},
|
|
{
|
|
.enumname = "SY",
|
|
.file = FILE_LINE,
|
|
.ucdname = "SY",
|
|
},
|
|
{
|
|
.enumname = "WJ",
|
|
.file = FILE_LINE,
|
|
.ucdname = "WJ",
|
|
},
|
|
{
|
|
.enumname = "ZW",
|
|
.file = FILE_LINE,
|
|
.ucdname = "ZW",
|
|
},
|
|
{
|
|
.enumname = "ZWJ",
|
|
.file = FILE_LINE,
|
|
.ucdname = "ZWJ",
|
|
},
|
|
{
|
|
.enumname = "TMP_AI",
|
|
.file = FILE_LINE,
|
|
.ucdname = "AI",
|
|
},
|
|
{
|
|
.enumname = "TMP_CJ",
|
|
.file = FILE_LINE,
|
|
.ucdname = "CJ",
|
|
},
|
|
{
|
|
.enumname = "TMP_XX",
|
|
.file = NULL,
|
|
.ucdname = NULL,
|
|
},
|
|
{
|
|
.enumname = "TMP_MN",
|
|
.file = FILE_LINE,
|
|
.ucdname = "Mn",
|
|
},
|
|
{
|
|
.enumname = "TMP_MC",
|
|
.file = FILE_LINE,
|
|
.ucdname = "Mc",
|
|
},
|
|
{
|
|
.enumname = "TMP_SA_WITHOUT_MN_OR_MC",
|
|
.file = FILE_LINE,
|
|
.ucdname = "SA",
|
|
},
|
|
{
|
|
.enumname = "TMP_SA_WITH_MN_OR_MC",
|
|
.file = FILE_LINE,
|
|
.ucdname = "SA",
|
|
},
|
|
{
|
|
.enumname = "TMP_SG",
|
|
.file = FILE_LINE,
|
|
.ucdname = "SG",
|
|
},
|
|
{
|
|
.enumname = "TMP_EAW_H",
|
|
.file = FILE_EAW,
|
|
.ucdname = "H",
|
|
},
|
|
{
|
|
.enumname = "TMP_EAW_W",
|
|
.file = FILE_EAW,
|
|
.ucdname = "W",
|
|
},
|
|
{
|
|
.enumname = "TMP_EAW_F",
|
|
.file = FILE_EAW,
|
|
.ucdname = "F",
|
|
},
|
|
};
|
|
|
|
static uint_least8_t
|
|
handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
|
|
{
|
|
uint_least8_t result = prop2;
|
|
char *target = NULL;
|
|
|
|
(void)cp;
|
|
|
|
if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
|
|
!strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
|
|
!strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) ||
|
|
(!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") ||
|
|
!strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") ||
|
|
!strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) {
|
|
if (!strcmp(line_break_property[prop1].enumname, "CP_WITHOUT_EAW_HWF") ||
|
|
!strcmp(line_break_property[prop2].enumname, "CP_WITHOUT_EAW_HWF")) {
|
|
target = "CP_WITH_EAW_HWF";
|
|
} else if (!strcmp(line_break_property[prop1].enumname, "OP_WITHOUT_EAW_HWF") ||
|
|
!strcmp(line_break_property[prop2].enumname, "OP_WITHOUT_EAW_HWF")) {
|
|
target = "OP_WITH_EAW_HWF";
|
|
} else {
|
|
/* ignore EAW for the rest */
|
|
if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
|
|
!strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
|
|
!strcmp(line_break_property[prop1].enumname, "TMP_EAW_F"))) {
|
|
result = prop2;
|
|
} else {
|
|
result = prop1;
|
|
}
|
|
}
|
|
} else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
|
|
!strcmp(line_break_property[prop1].enumname, "TMP_MC")) ||
|
|
(!strcmp(line_break_property[prop2].enumname, "TMP_MN") ||
|
|
!strcmp(line_break_property[prop2].enumname, "TMP_MC"))) {
|
|
if (!strcmp(line_break_property[prop1].enumname, "SA_WITHOUT_MN_OR_MC") ||
|
|
!strcmp(line_break_property[prop2].enumname, "SA_WITHOUT_MN_OR_MC")) {
|
|
target = "SA_WITH_MN_OR_MC";
|
|
} else {
|
|
/* ignore Mn and Mc for the rest */
|
|
if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
|
|
!strcmp(line_break_property[prop1].enumname, "TMP_MC"))) {
|
|
result = prop2;
|
|
} else {
|
|
result = prop1;
|
|
}
|
|
}
|
|
} else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
|
|
!strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
|
|
if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC") ||
|
|
!strcmp(line_break_property[prop2].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) {
|
|
target = "BOTH_CN_EXTPICT";
|
|
} else {
|
|
/* ignore Cn for all the other properties */
|
|
if (!strcmp(line_break_property[prop1].enumname, "TMP_CN")) {
|
|
result = prop2;
|
|
} else {
|
|
result = prop1;
|
|
}
|
|
}
|
|
} else if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC") ||
|
|
!strcmp(line_break_property[prop2].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) {
|
|
if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
|
|
!strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
|
|
target = "BOTH_CN_EXTPICT";
|
|
} else {
|
|
/* ignore Extended_Pictographic for all the other properties */
|
|
if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) {
|
|
result = prop2;
|
|
} else {
|
|
result = prop1;
|
|
}
|
|
}
|
|
} else {
|
|
fprintf(stderr, "handle_conflict: Cannot handle conflict %s <- %s.\n",
|
|
line_break_property[prop1].enumname, line_break_property[prop2].enumname);
|
|
exit(1);
|
|
}
|
|
|
|
if (target) {
|
|
for (result = 0; result < LEN(line_break_property); result++) {
|
|
if (!strcmp(line_break_property[result].enumname,
|
|
target)) {
|
|
break;
|
|
}
|
|
}
|
|
if (result == LEN(line_break_property)) {
|
|
fprintf(stderr, "handle_conflict: Internal error.\n");
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
static uint_least8_t
|
|
post_process(uint_least8_t prop)
|
|
{
|
|
const char *target = NULL;
|
|
uint_least8_t result;
|
|
|
|
/* LB1 */
|
|
if (!strcmp(line_break_property[prop].enumname, "TMP_AI") ||
|
|
!strcmp(line_break_property[prop].enumname, "TMP_SG") ||
|
|
!strcmp(line_break_property[prop].enumname, "TMP_XX")) {
|
|
/* map AI, SG and XX to AL */
|
|
target = "AL";
|
|
} else if (!strcmp(line_break_property[prop].enumname, "TMP_SA_WITH_MN_OR_MC")) {
|
|
/* map SA (with General_Category Mn or Mc) to CM */
|
|
target = "CM";
|
|
} else if (!strcmp(line_break_property[prop].enumname, "TMP_SA_WITHOUT_MN_OR_MC")) {
|
|
/* map SA (without General_Category Mn or Mc) to AL */
|
|
target = "AL";
|
|
} else if (!strcmp(line_break_property[prop].enumname, "TMP_CJ")) {
|
|
/* map CJ to NS */
|
|
target = "NS";
|
|
} else if (!strcmp(line_break_property[prop].enumname, "TMP_CN") ||
|
|
!strcmp(line_break_property[prop].enumname, "TMP_EXTENDED_PICTOGRAPHIC") ||
|
|
!strcmp(line_break_property[prop].enumname, "TMP_MN") ||
|
|
!strcmp(line_break_property[prop].enumname, "TMP_MC") ||
|
|
!strcmp(line_break_property[prop].enumname, "TMP_EAW_H") ||
|
|
!strcmp(line_break_property[prop].enumname, "TMP_EAW_W") ||
|
|
!strcmp(line_break_property[prop].enumname, "TMP_EAW_F")) {
|
|
/* map all the temporary classes "residue" to AL */
|
|
target = "AL";
|
|
}
|
|
|
|
if (target) {
|
|
for (result = 0; result < LEN(line_break_property); result++) {
|
|
if (!strcmp(line_break_property[result].enumname,
|
|
target)) {
|
|
break;
|
|
}
|
|
}
|
|
if (result == LEN(line_break_property)) {
|
|
fprintf(stderr, "handle_conflict: Internal error.\n");
|
|
exit(1);
|
|
}
|
|
|
|
return result;
|
|
} else {
|
|
return prop;
|
|
}
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
(void)argc;
|
|
|
|
properties_generate_break_property(line_break_property,
|
|
LEN(line_break_property),
|
|
handle_conflict, post_process,
|
|
"line_break", argv[0]);
|
|
|
|
return 0;
|
|
}
|