Initial Commit
This commit is contained in:
470
libs/libgrapheme-2.0.2/src/case.c
Normal file
470
libs/libgrapheme-2.0.2/src/case.c
Normal file
@@ -0,0 +1,470 @@
|
||||
/* See LICENSE file for copyright and license details. */
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "../grapheme.h"
|
||||
#include "../gen/case.h"
|
||||
#include "util.h"
|
||||
|
||||
static inline enum case_property
|
||||
get_case_property(uint_least32_t cp)
|
||||
{
|
||||
if (likely(cp <= UINT32_C(0x10FFFF))) {
|
||||
return (enum case_property)
|
||||
case_minor[case_major[cp >> 8] + (cp & 0xFF)];
|
||||
} else {
|
||||
return CASE_PROP_OTHER;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Fetch the case-mapping entry for cp from the given two-stage table
 * (major selects the block, minor holds the entries).
 *
 * For codepoints above U+10FFFF the result is 0. A result greater
 * than or equal to 0x110000 marks a special-case mapping and must be
 * handled separately by the caller; anything else is used by the
 * callers as a signed offset that is added to cp.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	if (unlikely(cp > UINT32_C(0x10FFFF))) {
		return 0;
	}

	return minor[major[cp >> 8] + (cp & 0xFF)];
}
|
||||
|
||||
static inline size_t
|
||||
to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
|
||||
uint_least8_t final_sigma_level, const uint_least16_t *major,
|
||||
const int_least32_t *minor, const struct special_case *sc)
|
||||
{
|
||||
HERODOTUS_READER tmp;
|
||||
enum case_property prop;
|
||||
enum herodotus_status s;
|
||||
size_t off, i;
|
||||
uint_least32_t cp, tmp_cp;
|
||||
int_least32_t map;
|
||||
|
||||
for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
|
||||
if (sc == lower_special) {
|
||||
/*
|
||||
* For the special Final_Sigma-rule (see SpecialCasing.txt),
|
||||
* which is the only non-localized case-dependent rule,
|
||||
* we apply a different mapping when a sigma is at the
|
||||
* end of a word.
|
||||
*
|
||||
* Before: cased case-ignorable*
|
||||
* After: not(case-ignorable* cased)
|
||||
*
|
||||
* We check the after-condition on demand, but the before-
|
||||
* condition is best checked using the "level"-heuristic
|
||||
* also used in the sentence and line breaking-implementations.
|
||||
*/
|
||||
if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER SIGMA */
|
||||
(final_sigma_level == 1 ||
|
||||
final_sigma_level == 2)) {
|
||||
/*
|
||||
* check succeeding characters by first skipping
|
||||
* all case-ignorable characters and then checking
|
||||
* if the succeeding character is cased, invalidating
|
||||
* the after-condition
|
||||
*/
|
||||
herodotus_reader_copy(r, &tmp);
|
||||
for (prop = NUM_CASE_PROPS;
|
||||
(s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
|
||||
HERODOTUS_STATUS_SUCCESS; ) {
|
||||
prop = get_case_property(tmp_cp);
|
||||
|
||||
if (prop != CASE_PROP_CASE_IGNORABLE &&
|
||||
prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Now prop is something other than case-ignorable or
|
||||
* the source-string ended.
|
||||
* If it is something other than cased, we know
|
||||
* that the after-condition holds
|
||||
*/
|
||||
if (s != HERODOTUS_STATUS_SUCCESS ||
|
||||
(prop != CASE_PROP_CASED &&
|
||||
prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
|
||||
/*
|
||||
* write GREEK SMALL LETTER FINAL SIGMA to
|
||||
* destination
|
||||
*/
|
||||
herodotus_write_codepoint(w, UINT32_C(0x03C2));
|
||||
|
||||
/* reset Final_Sigma-state and continue */
|
||||
final_sigma_level = 0;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* update state */
|
||||
prop = get_case_property(cp);
|
||||
if ((final_sigma_level == 0 ||
|
||||
final_sigma_level == 1) &&
|
||||
(prop == CASE_PROP_CASED ||
|
||||
prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
|
||||
/* sequence has begun */
|
||||
final_sigma_level = 1;
|
||||
} else if ((final_sigma_level == 1 ||
|
||||
final_sigma_level == 2) &&
|
||||
(prop == CASE_PROP_CASE_IGNORABLE ||
|
||||
prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
|
||||
/* case-ignorable sequence begins or continued */
|
||||
final_sigma_level = 2;
|
||||
} else {
|
||||
/* sequence broke */
|
||||
final_sigma_level = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* get and handle case mapping */
|
||||
if (unlikely((map = get_case_offset(cp, major, minor)) >=
|
||||
INT32_C(0x110000))) {
|
||||
/* we have a special case and the offset in the sc-array
|
||||
* is the difference to 0x110000*/
|
||||
off = (uint_least32_t)map - UINT32_C(0x110000);
|
||||
|
||||
for (i = 0; i < sc[off].cplen; i++) {
|
||||
herodotus_write_codepoint(w, sc[off].cp[i]);
|
||||
}
|
||||
} else {
|
||||
/* we have a simple mapping */
|
||||
herodotus_write_codepoint(w, (uint_least32_t)
|
||||
((int_least32_t)cp + map));
|
||||
}
|
||||
}
|
||||
|
||||
herodotus_writer_nul_terminate(w);
|
||||
|
||||
return herodotus_writer_number_written(w);
|
||||
}
|
||||
|
||||
static size_t
|
||||
herodotus_next_word_break(const HERODOTUS_READER *r)
|
||||
{
|
||||
HERODOTUS_READER tmp;
|
||||
|
||||
herodotus_reader_copy(r, &tmp);
|
||||
|
||||
if (r->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
return grapheme_next_word_break(tmp.src, tmp.srclen);
|
||||
} else { /* r->type == HERODOTUS_TYPE_UTF8 */
|
||||
return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
|
||||
}
|
||||
}
|
||||
|
||||
static inline size_t
|
||||
to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
|
||||
{
|
||||
enum case_property prop;
|
||||
enum herodotus_status s;
|
||||
uint_least32_t cp;
|
||||
size_t nwb;
|
||||
|
||||
for (; (nwb = herodotus_next_word_break(r)) > 0;) {
|
||||
herodotus_reader_push_advance_limit(r, nwb);
|
||||
for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
|
||||
/* check if we have a cased character */
|
||||
prop = get_case_property(cp);
|
||||
if (prop == CASE_PROP_CASED ||
|
||||
prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
|
||||
break;
|
||||
} else {
|
||||
/* write the data to the output verbatim, it if permits */
|
||||
herodotus_write_codepoint(w, cp);
|
||||
|
||||
/* increment reader */
|
||||
herodotus_read_codepoint(r, true, &cp);
|
||||
}
|
||||
}
|
||||
|
||||
if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
|
||||
/* we are done */
|
||||
herodotus_reader_pop_limit(r);
|
||||
break;
|
||||
} else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
|
||||
/*
|
||||
* we did not encounter any cased character
|
||||
* up to the word break
|
||||
*/
|
||||
herodotus_reader_pop_limit(r);
|
||||
continue;
|
||||
} else {
|
||||
/*
|
||||
* we encountered a cased character before the word
|
||||
* break, convert it to titlecase
|
||||
*/
|
||||
herodotus_reader_push_advance_limit(r,
|
||||
herodotus_reader_next_codepoint_break(r));
|
||||
to_case(r, w, 0, title_major, title_minor, title_special);
|
||||
herodotus_reader_pop_limit(r);
|
||||
}
|
||||
|
||||
/* cast the rest of the codepoints in the word to lowercase */
|
||||
to_case(r, w, 1, lower_major, lower_minor, lower_special);
|
||||
|
||||
/* remove the limit on the word before the next iteration */
|
||||
herodotus_reader_pop_limit(r);
|
||||
}
|
||||
|
||||
herodotus_writer_nul_terminate(w);
|
||||
|
||||
return herodotus_writer_number_written(w);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
HERODOTUS_WRITER w;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
|
||||
herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
|
||||
|
||||
return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
HERODOTUS_WRITER w;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
|
||||
herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
|
||||
|
||||
return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
HERODOTUS_WRITER w;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
|
||||
herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
|
||||
|
||||
return to_titlecase(&r, &w);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
HERODOTUS_WRITER w;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
|
||||
herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
|
||||
|
||||
return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
HERODOTUS_WRITER w;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
|
||||
herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
|
||||
|
||||
return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
HERODOTUS_WRITER w;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
|
||||
herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
|
||||
|
||||
return to_titlecase(&r, &w);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_case(HERODOTUS_READER *r, const uint_least16_t *major,
|
||||
const int_least32_t *minor, const struct special_case *sc,
|
||||
size_t *output)
|
||||
{
|
||||
size_t off, i;
|
||||
bool ret = true;
|
||||
uint_least32_t cp;
|
||||
int_least32_t map;
|
||||
|
||||
for (; herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;) {
|
||||
/* get and handle case mapping */
|
||||
if (unlikely((map = get_case_offset(cp, major, minor)) >=
|
||||
INT32_C(0x110000))) {
|
||||
/* we have a special case and the offset in the sc-array
|
||||
* is the difference to 0x110000*/
|
||||
off = (uint_least32_t)map - UINT32_C(0x110000);
|
||||
|
||||
for (i = 0; i < sc[off].cplen; i++) {
|
||||
if (herodotus_read_codepoint(r, false, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS) {
|
||||
if (cp != sc[off].cp[i]) {
|
||||
ret = false;
|
||||
goto done;
|
||||
} else {
|
||||
/* move forward */
|
||||
herodotus_read_codepoint(r, true, &cp);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* input ended and we didn't see
|
||||
* any difference so far, so this
|
||||
* string is in fact okay
|
||||
*/
|
||||
ret = true;
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* we have a simple mapping */
|
||||
if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
|
||||
/* we have a difference */
|
||||
ret = false;
|
||||
goto done;
|
||||
} else {
|
||||
/* move forward */
|
||||
herodotus_read_codepoint(r, true, &cp);
|
||||
}
|
||||
}
|
||||
}
|
||||
done:
|
||||
if (output) {
|
||||
*output = herodotus_reader_number_read(r);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_titlecase(HERODOTUS_READER *r, size_t *output)
|
||||
{
|
||||
enum case_property prop;
|
||||
enum herodotus_status s;
|
||||
bool ret = true;
|
||||
uint_least32_t cp;
|
||||
size_t nwb;
|
||||
|
||||
for (; (nwb = herodotus_next_word_break(r)) > 0;) {
|
||||
herodotus_reader_push_advance_limit(r, nwb);
|
||||
for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
|
||||
/* check if we have a cased character */
|
||||
prop = get_case_property(cp);
|
||||
if (prop == CASE_PROP_CASED ||
|
||||
prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
|
||||
break;
|
||||
} else {
|
||||
/* increment reader */
|
||||
herodotus_read_codepoint(r, true, &cp);
|
||||
}
|
||||
}
|
||||
|
||||
if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
|
||||
/* we are done */
|
||||
break;
|
||||
} else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
|
||||
/*
|
||||
* we did not encounter any cased character
|
||||
* up to the word break
|
||||
*/
|
||||
herodotus_reader_pop_limit(r);
|
||||
continue;
|
||||
} else {
|
||||
/*
|
||||
* we encountered a cased character before the word
|
||||
* break, check if it's titlecase
|
||||
*/
|
||||
herodotus_reader_push_advance_limit(r,
|
||||
herodotus_reader_next_codepoint_break(r));
|
||||
if (!is_case(r, title_major, title_minor, title_special, NULL)) {
|
||||
ret = false;
|
||||
goto done;
|
||||
}
|
||||
herodotus_reader_pop_limit(r);
|
||||
}
|
||||
|
||||
/* check if the rest of the codepoints in the word are lowercase */
|
||||
if (!is_case(r, lower_major, lower_minor, lower_special, NULL)) {
|
||||
ret = false;
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* remove the limit on the word before the next iteration */
|
||||
herodotus_reader_pop_limit(r);
|
||||
}
|
||||
done:
|
||||
if (output) {
|
||||
*output = herodotus_reader_number_read(r);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool
|
||||
grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
|
||||
|
||||
return is_case(&r, upper_major, upper_minor, upper_special, caselen);
|
||||
}
|
||||
|
||||
bool
|
||||
grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
|
||||
|
||||
return is_case(&r, lower_major, lower_minor, lower_special, caselen);
|
||||
}
|
||||
|
||||
bool
|
||||
grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
|
||||
|
||||
return is_titlecase(&r, caselen);
|
||||
}
|
||||
|
||||
bool
|
||||
grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
|
||||
|
||||
return is_case(&r, upper_major, upper_minor, upper_special, caselen);
|
||||
}
|
||||
|
||||
bool
|
||||
grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
|
||||
|
||||
return is_case(&r, lower_major, lower_minor, lower_special, caselen);
|
||||
}
|
||||
|
||||
bool
|
||||
grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
|
||||
|
||||
return is_titlecase(&r, caselen);
|
||||
}
|
BIN
libs/libgrapheme-2.0.2/src/case.o
Normal file
BIN
libs/libgrapheme-2.0.2/src/case.o
Normal file
Binary file not shown.
243
libs/libgrapheme-2.0.2/src/character.c
Normal file
243
libs/libgrapheme-2.0.2/src/character.c
Normal file
@@ -0,0 +1,243 @@
|
||||
/* See LICENSE file for copyright and license details. */
|
||||
#include <limits.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include "../gen/character.h"
|
||||
#include "../grapheme.h"
|
||||
#include "util.h"
|
||||
|
||||
/*
 * Unpacked form of the 16-bit state word that callers carry between
 * successive calls of grapheme_is_character_break().
 */
struct character_break_state {
	/* break property of the right-hand codepoint of the last call */
	uint_least8_t prop;
	/* true once prop holds a value */
	bool prop_set;
	/* progress flag for rule GB11 */
	bool gb11_flag;
	/* progress flag for rules GB12 and GB13 */
	bool gb12_13_flag;
};
|
||||
|
||||
static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
|
||||
[CHAR_BREAK_PROP_OTHER] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_CR] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */
|
||||
[CHAR_BREAK_PROP_EXTEND] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_HANGUL_L] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_HANGUL_V] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_HANGUL_T] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_HANGUL_LV] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_HANGUL_LVT] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_PREPEND] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
|
||||
(UINT16_C(0xFFFF) &
|
||||
~(UINT16_C(1) << CHAR_BREAK_PROP_CR |
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_LF |
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_CONTROL
|
||||
)
|
||||
), /* GB9b */
|
||||
[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_SPACINGMARK] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
[CHAR_BREAK_PROP_ZWJ] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
|
||||
};
|
||||
static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
|
||||
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
|
||||
[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
|
||||
[CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
|
||||
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
|
||||
};
|
||||
static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
|
||||
[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
|
||||
};
|
||||
static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
|
||||
[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
|
||||
};
|
||||
static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
|
||||
[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
|
||||
UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
|
||||
};
|
||||
|
||||
static inline enum char_break_property
|
||||
get_break_prop(uint_least32_t cp)
|
||||
{
|
||||
if (likely(cp <= UINT32_C(0x10FFFF))) {
|
||||
return (enum char_break_property)
|
||||
char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)];
|
||||
} else {
|
||||
return CHAR_BREAK_PROP_OTHER;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
state_serialize(const struct character_break_state *in, uint_least16_t *out)
|
||||
{
|
||||
*out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
|
||||
(uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8) | /* 9th bit */
|
||||
(uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | /* 10th bit */
|
||||
(uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10); /* 11th bit */
|
||||
}
|
||||
|
||||
static inline void
|
||||
state_deserialize(uint_least16_t in, struct character_break_state *out)
|
||||
{
|
||||
out->prop = in & UINT8_C(0xFF);
|
||||
out->prop_set = in & (UINT16_C(1) << 8);
|
||||
out->gb11_flag = in & (UINT16_C(1) << 9);
|
||||
out->gb12_13_flag = in & (UINT16_C(1) << 10);
|
||||
}
|
||||
|
||||
bool
|
||||
grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s)
|
||||
{
|
||||
struct character_break_state state;
|
||||
enum char_break_property cp0_prop, cp1_prop;
|
||||
bool notbreak = false;
|
||||
|
||||
if (likely(s)) {
|
||||
state_deserialize(*s, &state);
|
||||
|
||||
if (likely(state.prop_set)) {
|
||||
cp0_prop = state.prop;
|
||||
} else {
|
||||
cp0_prop = get_break_prop(cp0);
|
||||
}
|
||||
cp1_prop = get_break_prop(cp1);
|
||||
|
||||
/* preserve prop of right codepoint for next iteration */
|
||||
state.prop = (uint_least8_t)cp1_prop;
|
||||
state.prop_set = true;
|
||||
|
||||
/* update flags */
|
||||
state.gb11_flag =
|
||||
flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
|
||||
state.gb11_flag] &
|
||||
UINT16_C(1) << cp1_prop;
|
||||
state.gb12_13_flag =
|
||||
flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
|
||||
state.gb12_13_flag] &
|
||||
UINT16_C(1) << cp1_prop;
|
||||
|
||||
/*
|
||||
* Apply grapheme cluster breaking algorithm (UAX #29), see
|
||||
* http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
|
||||
*/
|
||||
notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
|
||||
(dont_break_gb11[cp0_prop + state.gb11_flag *
|
||||
NUM_CHAR_BREAK_PROPS] &
|
||||
(UINT16_C(1) << cp1_prop)) ||
|
||||
(dont_break_gb12_13[cp0_prop + state.gb12_13_flag *
|
||||
NUM_CHAR_BREAK_PROPS] &
|
||||
(UINT16_C(1) << cp1_prop));
|
||||
|
||||
/* update or reset flags (when we have a break) */
|
||||
if (likely(!notbreak)) {
|
||||
state.gb11_flag = state.gb12_13_flag = false;
|
||||
}
|
||||
|
||||
state_serialize(&state, s);
|
||||
} else {
|
||||
cp0_prop = get_break_prop(cp0);
|
||||
cp1_prop = get_break_prop(cp1);
|
||||
|
||||
/*
|
||||
* Apply grapheme cluster breaking algorithm (UAX #29), see
|
||||
* http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
|
||||
*
|
||||
* Given we have no state, this behaves as if the state-booleans
|
||||
* were all set to false
|
||||
*/
|
||||
notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
|
||||
(dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
|
||||
(dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop));
|
||||
}
|
||||
|
||||
return !notbreak;
|
||||
}
|
||||
|
||||
static size_t
|
||||
next_character_break(HERODOTUS_READER *r)
|
||||
{
|
||||
uint_least16_t state = 0;
|
||||
uint_least32_t cp0 = 0, cp1 = 0;
|
||||
|
||||
for (herodotus_read_codepoint(r, true, &cp0);
|
||||
herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS;
|
||||
herodotus_read_codepoint(r, true, &cp0)) {
|
||||
if (grapheme_is_character_break(cp0, cp1, &state)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return herodotus_reader_number_read(r);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_next_character_break(const uint_least32_t *str, size_t len)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
|
||||
|
||||
return next_character_break(&r);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_next_character_break_utf8(const char *str, size_t len)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
|
||||
|
||||
return next_character_break(&r);
|
||||
}
|
BIN
libs/libgrapheme-2.0.2/src/character.o
Normal file
BIN
libs/libgrapheme-2.0.2/src/character.o
Normal file
Binary file not shown.
510
libs/libgrapheme-2.0.2/src/line.c
Normal file
510
libs/libgrapheme-2.0.2/src/line.c
Normal file
@@ -0,0 +1,510 @@
|
||||
/* See LICENSE file for copyright and license details. */
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include "../gen/line.h"
|
||||
#include "../grapheme.h"
|
||||
#include "util.h"
|
||||
|
||||
static inline enum line_break_property
|
||||
get_break_prop(uint_least32_t cp)
|
||||
{
|
||||
if (likely(cp <= UINT32_C(0x10FFFF))) {
|
||||
return (enum line_break_property)
|
||||
line_break_minor[line_break_major[cp >> 8] + (cp & 0xff)];
|
||||
} else {
|
||||
return LINE_BREAK_PROP_AL;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t
|
||||
next_line_break(HERODOTUS_READER *r)
|
||||
{
|
||||
HERODOTUS_READER tmp;
|
||||
enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
|
||||
last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
|
||||
uint_least32_t cp;
|
||||
uint_least8_t lb25_level = 0;
|
||||
bool lb21a_flag = false, ri_even = true;
|
||||
|
||||
/*
|
||||
* Apply line breaking algorithm (UAX #14), see
|
||||
* https://unicode.org/reports/tr14/#Algorithm and tailoring
|
||||
* https://unicode.org/reports/tr14/#Examples (example 7),
|
||||
* given the automatic test-cases implement this example for
|
||||
* better number handling.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* Initialize the different properties such that we have
|
||||
* a good state after the state-update in the loop
|
||||
*/
|
||||
last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
|
||||
last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
|
||||
|
||||
for (herodotus_read_codepoint(r, true, &cp), cp0_prop = get_break_prop(cp);
|
||||
herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;
|
||||
herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
|
||||
/* get property of the right codepoint */
|
||||
cp1_prop = get_break_prop(cp);
|
||||
|
||||
/* update retention-states */
|
||||
|
||||
/*
|
||||
* store the last observed non-CM-or-ZWJ-property for
|
||||
* LB9 and following.
|
||||
*/
|
||||
if (cp0_prop != LINE_BREAK_PROP_CM &&
|
||||
cp0_prop != LINE_BREAK_PROP_ZWJ) {
|
||||
/*
|
||||
* check if the property we are overwriting now is an
|
||||
* HL. If so, we set the LB21a-flag which depends on this
|
||||
* knowledge.
|
||||
*/
|
||||
lb21a_flag = (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
|
||||
|
||||
/* check regional indicator state */
|
||||
if (cp0_prop == LINE_BREAK_PROP_RI) {
|
||||
/*
|
||||
* The property we just shifted in is
|
||||
* a regional indicator, increasing the
|
||||
* number of consecutive RIs on the left
|
||||
* side of the breakpoint by one, changing
|
||||
* the oddness.
|
||||
*
|
||||
*/
|
||||
ri_even = !ri_even;
|
||||
} else {
|
||||
/*
|
||||
* We saw no regional indicator, so the
|
||||
* number of consecutive RIs on the left
|
||||
* side of the breakpoint is zero, which
|
||||
* is an even number.
|
||||
*
|
||||
*/
|
||||
ri_even = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Here comes a bit of magic. The tailored rule
|
||||
* LB25 (using example 7) has a very complicated
|
||||
* left-hand-side-rule of the form
|
||||
*
|
||||
* NU (NU | SY | IS)* (CL | CP)?
|
||||
*
|
||||
* but instead of backtracking, we keep the state
|
||||
* as some kind of "power level" in the variable
|
||||
*
|
||||
* lb25_level
|
||||
*
|
||||
* that goes from 0 to 3
|
||||
*
|
||||
* 0: we are not in the sequence
|
||||
* 1: we have one NU to the left of the middle
|
||||
* spot
|
||||
* 2: we have one NU and one or more (NU | SY | IS)
|
||||
* to the left of the middle spot
|
||||
* 3: we have one NU, zero or more (NU | SY | IS)
|
||||
* and one (CL | CP) to the left of the middle
|
||||
* spot
|
||||
*/
|
||||
if ((lb25_level == 0 ||
|
||||
lb25_level == 1) &&
|
||||
cp0_prop == LINE_BREAK_PROP_NU) {
|
||||
/* sequence has begun */
|
||||
lb25_level = 1;
|
||||
} else if ((lb25_level == 1 || lb25_level == 2) &&
|
||||
(cp0_prop == LINE_BREAK_PROP_NU ||
|
||||
cp0_prop == LINE_BREAK_PROP_SY ||
|
||||
cp0_prop == LINE_BREAK_PROP_IS)) {
|
||||
/* (NU | SY | IS) sequence begins or continued */
|
||||
lb25_level = 2;
|
||||
} else if ((lb25_level == 1 || lb25_level == 2) &&
|
||||
(cp0_prop == LINE_BREAK_PROP_CL ||
|
||||
cp0_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
|
||||
cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
|
||||
/* CL or CP at the end of the sequence */
|
||||
lb25_level = 3;
|
||||
} else {
|
||||
/* sequence broke */
|
||||
lb25_level = 0;
|
||||
}
|
||||
|
||||
last_non_cm_or_zwj_prop = cp0_prop;
|
||||
}
|
||||
|
||||
/*
|
||||
* store the last observed non-SP-property for LB8, LB14,
|
||||
* LB15, LB16 and LB17. LB8 gets its own unskipped property,
|
||||
* whereas the others build on top of the CM-ZWJ-skipped
|
||||
* properties as they come after LB9
|
||||
*/
|
||||
if (cp0_prop != LINE_BREAK_PROP_SP) {
|
||||
last_non_sp_prop = cp0_prop;
|
||||
}
|
||||
if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
|
||||
last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
|
||||
}
|
||||
|
||||
/* apply the algorithm */
|
||||
|
||||
/* LB4 */
|
||||
if (cp0_prop == LINE_BREAK_PROP_BK) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* LB5 */
|
||||
if (cp0_prop == LINE_BREAK_PROP_CR &&
|
||||
cp1_prop == LINE_BREAK_PROP_LF) {
|
||||
continue;
|
||||
}
|
||||
if (cp0_prop == LINE_BREAK_PROP_CR ||
|
||||
cp0_prop == LINE_BREAK_PROP_LF ||
|
||||
cp0_prop == LINE_BREAK_PROP_NL) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* LB6 */
|
||||
if (cp1_prop == LINE_BREAK_PROP_BK ||
|
||||
cp1_prop == LINE_BREAK_PROP_CR ||
|
||||
cp1_prop == LINE_BREAK_PROP_LF ||
|
||||
cp1_prop == LINE_BREAK_PROP_NL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB7 */
|
||||
if (cp1_prop == LINE_BREAK_PROP_SP ||
|
||||
cp1_prop == LINE_BREAK_PROP_ZW) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB8 */
|
||||
if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* LB8a */
|
||||
if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB9 */
|
||||
if ((cp0_prop != LINE_BREAK_PROP_BK &&
|
||||
cp0_prop != LINE_BREAK_PROP_CR &&
|
||||
cp0_prop != LINE_BREAK_PROP_LF &&
|
||||
cp0_prop != LINE_BREAK_PROP_NL &&
|
||||
cp0_prop != LINE_BREAK_PROP_SP &&
|
||||
cp0_prop != LINE_BREAK_PROP_ZW) &&
|
||||
(cp1_prop == LINE_BREAK_PROP_CM ||
|
||||
cp1_prop == LINE_BREAK_PROP_ZWJ)) {
|
||||
/*
|
||||
* given we skip them, we don't break in such
|
||||
* a sequence
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB10 is baked into the following rules */
|
||||
|
||||
/* LB11 */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
|
||||
cp1_prop == LINE_BREAK_PROP_WJ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB12 */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB12a */
|
||||
if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
|
||||
last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
|
||||
last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
|
||||
cp1_prop == LINE_BREAK_PROP_GL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB13 (affected by tailoring for LB25, see example 7) */
|
||||
if (cp1_prop == LINE_BREAK_PROP_EX ||
|
||||
(last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
|
||||
(cp1_prop == LINE_BREAK_PROP_CL ||
|
||||
cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
|
||||
cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
|
||||
cp1_prop == LINE_BREAK_PROP_IS ||
|
||||
cp1_prop == LINE_BREAK_PROP_SY))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB14 */
|
||||
if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
|
||||
last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB15 */
|
||||
if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
|
||||
(cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
|
||||
cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB16 */
|
||||
if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
|
||||
last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
|
||||
last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
|
||||
cp1_prop == LINE_BREAK_PROP_NS) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB17 */
|
||||
if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
|
||||
cp1_prop == LINE_BREAK_PROP_B2) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB18 */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* LB19 */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
|
||||
cp1_prop == LINE_BREAK_PROP_QU) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB20 */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
|
||||
cp1_prop == LINE_BREAK_PROP_CB) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* LB21 */
|
||||
if (cp1_prop == LINE_BREAK_PROP_BA ||
|
||||
cp1_prop == LINE_BREAK_PROP_HY ||
|
||||
cp1_prop == LINE_BREAK_PROP_NS ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB21a */
|
||||
if (lb21a_flag &&
|
||||
(last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB21b */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
|
||||
cp1_prop == LINE_BREAK_PROP_HL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB22 */
|
||||
if (cp1_prop == LINE_BREAK_PROP_IN) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB23 */
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
|
||||
cp1_prop == LINE_BREAK_PROP_NU) {
|
||||
continue;
|
||||
}
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
|
||||
(cp1_prop == LINE_BREAK_PROP_AL ||
|
||||
cp1_prop == LINE_BREAK_PROP_HL)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB23a */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
|
||||
(cp1_prop == LINE_BREAK_PROP_ID ||
|
||||
cp1_prop == LINE_BREAK_PROP_EB ||
|
||||
cp1_prop == LINE_BREAK_PROP_EM)) {
|
||||
continue;
|
||||
}
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
|
||||
cp1_prop == LINE_BREAK_PROP_PO) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB24 */
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
|
||||
(cp1_prop == LINE_BREAK_PROP_AL ||
|
||||
cp1_prop == LINE_BREAK_PROP_HL)) {
|
||||
continue;
|
||||
}
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
|
||||
(cp1_prop == LINE_BREAK_PROP_PR ||
|
||||
cp1_prop == LINE_BREAK_PROP_PO)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB25 (tailored with example 7) */
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
|
||||
if (cp1_prop == LINE_BREAK_PROP_NU) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* this stupid rule is the reason why we cannot
|
||||
* simply have a stateful break-detection between
|
||||
* two adjacent codepoints as we have it with
|
||||
* characters.
|
||||
*/
|
||||
herodotus_reader_copy(r, &tmp);
|
||||
herodotus_read_codepoint(&tmp, true, &cp);
|
||||
if (herodotus_read_codepoint(&tmp, true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS &&
|
||||
(cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
|
||||
cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
|
||||
cp1_prop == LINE_BREAK_PROP_HY)) {
|
||||
if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
|
||||
cp1_prop == LINE_BREAK_PROP_NU) {
|
||||
continue;
|
||||
}
|
||||
if (lb25_level == 1 &&
|
||||
(cp1_prop == LINE_BREAK_PROP_NU ||
|
||||
cp1_prop == LINE_BREAK_PROP_SY ||
|
||||
cp1_prop == LINE_BREAK_PROP_IS)) {
|
||||
continue;
|
||||
}
|
||||
if ((lb25_level == 1 || lb25_level == 2) &&
|
||||
(cp1_prop == LINE_BREAK_PROP_NU ||
|
||||
cp1_prop == LINE_BREAK_PROP_SY ||
|
||||
cp1_prop == LINE_BREAK_PROP_IS ||
|
||||
cp1_prop == LINE_BREAK_PROP_CL ||
|
||||
cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
|
||||
cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
|
||||
continue;
|
||||
}
|
||||
if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
|
||||
(cp1_prop == LINE_BREAK_PROP_PO ||
|
||||
cp1_prop == LINE_BREAK_PROP_PR)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB26 */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
|
||||
(cp1_prop == LINE_BREAK_PROP_JL ||
|
||||
cp1_prop == LINE_BREAK_PROP_JV ||
|
||||
cp1_prop == LINE_BREAK_PROP_H2 ||
|
||||
cp1_prop == LINE_BREAK_PROP_H3)) {
|
||||
continue;
|
||||
}
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
|
||||
(cp1_prop == LINE_BREAK_PROP_JV ||
|
||||
cp1_prop == LINE_BREAK_PROP_JT)) {
|
||||
continue;
|
||||
}
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
|
||||
cp1_prop == LINE_BREAK_PROP_JT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB27 */
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
|
||||
cp1_prop == LINE_BREAK_PROP_PO) {
|
||||
continue;
|
||||
}
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
|
||||
(cp1_prop == LINE_BREAK_PROP_JL ||
|
||||
cp1_prop == LINE_BREAK_PROP_JV ||
|
||||
cp1_prop == LINE_BREAK_PROP_JT ||
|
||||
cp1_prop == LINE_BREAK_PROP_H2 ||
|
||||
cp1_prop == LINE_BREAK_PROP_H3)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB28 */
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
|
||||
(cp1_prop == LINE_BREAK_PROP_AL ||
|
||||
cp1_prop == LINE_BREAK_PROP_HL)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB29 */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
|
||||
(cp1_prop == LINE_BREAK_PROP_AL ||
|
||||
cp1_prop == LINE_BREAK_PROP_HL)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB30 */
|
||||
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
|
||||
cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
|
||||
continue;
|
||||
}
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
|
||||
(cp1_prop == LINE_BREAK_PROP_AL ||
|
||||
cp1_prop == LINE_BREAK_PROP_HL ||
|
||||
cp1_prop == LINE_BREAK_PROP_NU)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB30a */
|
||||
if (!ri_even &&
|
||||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
|
||||
cp1_prop == LINE_BREAK_PROP_RI) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB30b */
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
|
||||
cp1_prop == LINE_BREAK_PROP_EM) {
|
||||
continue;
|
||||
}
|
||||
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
|
||||
cp1_prop == LINE_BREAK_PROP_EM) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* LB31 */
|
||||
break;
|
||||
}
|
||||
|
||||
return herodotus_reader_number_read(r);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_next_line_break(const uint_least32_t *str, size_t len)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
|
||||
|
||||
return next_line_break(&r);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_next_line_break_utf8(const char *str, size_t len)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
|
||||
|
||||
return next_line_break(&r);
|
||||
}
|
BIN
libs/libgrapheme-2.0.2/src/line.o
Normal file
BIN
libs/libgrapheme-2.0.2/src/line.o
Normal file
Binary file not shown.
282
libs/libgrapheme-2.0.2/src/sentence.c
Normal file
282
libs/libgrapheme-2.0.2/src/sentence.c
Normal file
@@ -0,0 +1,282 @@
|
||||
/* See LICENSE file for copyright and license details. */
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include "../gen/sentence.h"
|
||||
#include "../grapheme.h"
|
||||
#include "util.h"
|
||||
|
||||
/*
 * Progress counters for the left-hand-side sequences of the sentence
 * break rules SB8 to SB11; both are maintained by
 * sentence_skip_shift_callback().
 */
struct sentence_break_state
{
	/* progress within "ATerm Close* Sp*" (0 = not in sequence, up to 3) */
	uint_least8_t aterm_close_sp_level;
	/* progress within "SATerm Close* Sp* ParaSep?" (0 = not in sequence, up to 4) */
	uint_least8_t saterm_close_sp_parasep_level;
};
|
||||
|
||||
static inline uint_least8_t
|
||||
get_sentence_break_prop(uint_least32_t cp)
|
||||
{
|
||||
if (likely(cp <= UINT32_C(0x10FFFF))) {
|
||||
return (uint_least8_t)
|
||||
sentence_break_minor[sentence_break_major[cp >> 8] +
|
||||
(cp & 0xff)];
|
||||
} else {
|
||||
return SENTENCE_BREAK_PROP_OTHER;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
is_skippable_sentence_prop(uint_least8_t prop)
|
||||
{
|
||||
return prop == SENTENCE_BREAK_PROP_EXTEND ||
|
||||
prop == SENTENCE_BREAK_PROP_FORMAT;
|
||||
}
|
||||
|
||||
static void
|
||||
sentence_skip_shift_callback(uint_least8_t prop, void *s)
|
||||
{
|
||||
struct sentence_break_state *state = (struct sentence_break_state *)s;
|
||||
|
||||
/*
|
||||
* Here comes a bit of magic. The rules
|
||||
* SB8, SB8a, SB9 and SB10 have very complicated
|
||||
* left-hand-side-rules of the form
|
||||
*
|
||||
* ATerm Close* Sp*
|
||||
* SATerm Close*
|
||||
* SATerm Close* Sp*
|
||||
* SATerm Close* Sp* ParaSep?
|
||||
*
|
||||
* but instead of backtracking, we keep the
|
||||
* state as some kind of "power level" in
|
||||
* two state-variables
|
||||
*
|
||||
* aterm_close_sp_level
|
||||
* saterm_close_sp_parasep_level
|
||||
*
|
||||
* that go from 0 to 3/4:
|
||||
*
|
||||
* 0: we are not in the sequence
|
||||
* 1: we have one ATerm/SATerm to the left of
|
||||
* the middle spot
|
||||
* 2: we have one ATerm/SATerm and one or more
|
||||
* Close to the left of the middle spot
|
||||
* 3: we have one ATerm/SATerm, zero or more
|
||||
* Close and one or more Sp to the left of
|
||||
* the middle spot.
|
||||
* 4: we have one SATerm, zero or more Close,
|
||||
* zero or more Sp and one ParaSep to the
|
||||
* left of the middle spot.
|
||||
*
|
||||
*/
|
||||
if ((state->aterm_close_sp_level == 0 ||
|
||||
state->aterm_close_sp_level == 1) &&
|
||||
prop == SENTENCE_BREAK_PROP_ATERM) {
|
||||
/* sequence has begun */
|
||||
state->aterm_close_sp_level = 1;
|
||||
} else if ((state->aterm_close_sp_level == 1 ||
|
||||
state->aterm_close_sp_level == 2) &&
|
||||
prop == SENTENCE_BREAK_PROP_CLOSE) {
|
||||
/* close-sequence begins or continued */
|
||||
state->aterm_close_sp_level = 2;
|
||||
} else if ((state->aterm_close_sp_level == 1 ||
|
||||
state->aterm_close_sp_level == 2 ||
|
||||
state->aterm_close_sp_level == 3) &&
|
||||
prop == SENTENCE_BREAK_PROP_SP) {
|
||||
/* sp-sequence begins or continued */
|
||||
state->aterm_close_sp_level = 3;
|
||||
} else {
|
||||
/* sequence broke */
|
||||
state->aterm_close_sp_level = 0;
|
||||
}
|
||||
|
||||
if ((state->saterm_close_sp_parasep_level == 0 ||
|
||||
state->saterm_close_sp_parasep_level == 1) &&
|
||||
(prop == SENTENCE_BREAK_PROP_STERM ||
|
||||
prop == SENTENCE_BREAK_PROP_ATERM)) {
|
||||
/* sequence has begun */
|
||||
state->saterm_close_sp_parasep_level = 1;
|
||||
} else if ((state->saterm_close_sp_parasep_level == 1 ||
|
||||
state->saterm_close_sp_parasep_level == 2) &&
|
||||
prop == SENTENCE_BREAK_PROP_CLOSE) {
|
||||
/* close-sequence begins or continued */
|
||||
state->saterm_close_sp_parasep_level = 2;
|
||||
} else if ((state->saterm_close_sp_parasep_level == 1 ||
|
||||
state->saterm_close_sp_parasep_level == 2 ||
|
||||
state->saterm_close_sp_parasep_level == 3) &&
|
||||
prop == SENTENCE_BREAK_PROP_SP) {
|
||||
/* sp-sequence begins or continued */
|
||||
state->saterm_close_sp_parasep_level = 3;
|
||||
} else if ((state->saterm_close_sp_parasep_level == 1 ||
|
||||
state->saterm_close_sp_parasep_level == 2 ||
|
||||
state->saterm_close_sp_parasep_level == 3) &&
|
||||
(prop == SENTENCE_BREAK_PROP_SEP ||
|
||||
prop == SENTENCE_BREAK_PROP_CR ||
|
||||
prop == SENTENCE_BREAK_PROP_LF)) {
|
||||
/* ParaSep at the end of the sequence */
|
||||
state->saterm_close_sp_parasep_level = 4;
|
||||
} else {
|
||||
/* sequence broke */
|
||||
state->saterm_close_sp_parasep_level = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t
|
||||
next_sentence_break(HERODOTUS_READER *r)
|
||||
{
|
||||
HERODOTUS_READER tmp;
|
||||
enum sentence_break_property prop;
|
||||
struct proper p;
|
||||
struct sentence_break_state state = { 0 };
|
||||
uint_least32_t cp;
|
||||
|
||||
/*
|
||||
* Apply sentence breaking algorithm (UAX #29), see
|
||||
* https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
|
||||
*/
|
||||
proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
|
||||
get_sentence_break_prop, is_skippable_sentence_prop,
|
||||
sentence_skip_shift_callback, &p);
|
||||
|
||||
while (!proper_advance(&p)) {
|
||||
/* SB3 */
|
||||
if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
|
||||
p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* SB4 */
|
||||
if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
|
||||
p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
|
||||
p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* SB5 */
|
||||
if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
|
||||
p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* SB6 */
|
||||
if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* SB7 */
|
||||
if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
|
||||
p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
|
||||
p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* SB8 */
|
||||
if (state.aterm_close_sp_level == 1 ||
|
||||
state.aterm_close_sp_level == 2 ||
|
||||
state.aterm_close_sp_level == 3) {
|
||||
/*
|
||||
* This is the most complicated rule, requiring
|
||||
* the right-hand-side to satisfy the regular expression
|
||||
*
|
||||
* ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
|
||||
*
|
||||
* which we simply check "manually" given LUT-lookups
|
||||
* are very cheap by starting at the mid_reader.
|
||||
*
|
||||
*/
|
||||
herodotus_reader_copy(&(p.mid_reader), &tmp);
|
||||
|
||||
prop = NUM_SENTENCE_BREAK_PROPS;
|
||||
while (herodotus_read_codepoint(&tmp, true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS) {
|
||||
prop = get_sentence_break_prop(cp);
|
||||
|
||||
/*
|
||||
* the skippable properties are ignored
|
||||
* automatically here given they do not
|
||||
* match the following condition
|
||||
*/
|
||||
if (prop == SENTENCE_BREAK_PROP_OLETTER ||
|
||||
prop == SENTENCE_BREAK_PROP_UPPER ||
|
||||
prop == SENTENCE_BREAK_PROP_LOWER ||
|
||||
prop == SENTENCE_BREAK_PROP_SEP ||
|
||||
prop == SENTENCE_BREAK_PROP_CR ||
|
||||
prop == SENTENCE_BREAK_PROP_LF ||
|
||||
prop == SENTENCE_BREAK_PROP_STERM ||
|
||||
prop == SENTENCE_BREAK_PROP_ATERM) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (prop == SENTENCE_BREAK_PROP_LOWER) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* SB8a */
|
||||
if ((state.saterm_close_sp_parasep_level == 1 ||
|
||||
state.saterm_close_sp_parasep_level == 2 ||
|
||||
state.saterm_close_sp_parasep_level == 3) &&
|
||||
(p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* SB9 */
|
||||
if ((state.saterm_close_sp_parasep_level == 1 ||
|
||||
state.saterm_close_sp_parasep_level == 2) &&
|
||||
(p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* SB10 */
|
||||
if ((state.saterm_close_sp_parasep_level == 1 ||
|
||||
state.saterm_close_sp_parasep_level == 2 ||
|
||||
state.saterm_close_sp_parasep_level == 3) &&
|
||||
(p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
|
||||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* SB11 */
|
||||
if (state.saterm_close_sp_parasep_level == 1 ||
|
||||
state.saterm_close_sp_parasep_level == 2 ||
|
||||
state.saterm_close_sp_parasep_level == 3 ||
|
||||
state.saterm_close_sp_parasep_level == 4) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* SB998 */
|
||||
continue;
|
||||
}
|
||||
|
||||
return herodotus_reader_number_read(&(p.mid_reader));
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
|
||||
|
||||
return next_sentence_break(&r);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_next_sentence_break_utf8(const char *str, size_t len)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
|
||||
|
||||
return next_sentence_break(&r);
|
||||
}
|
BIN
libs/libgrapheme-2.0.2/src/sentence.o
Normal file
BIN
libs/libgrapheme-2.0.2/src/sentence.o
Normal file
Binary file not shown.
219
libs/libgrapheme-2.0.2/src/utf8.c
Normal file
219
libs/libgrapheme-2.0.2/src/utf8.c
Normal file
@@ -0,0 +1,219 @@
|
||||
/* See LICENSE file for copyright and license details. */
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "../grapheme.h"
|
||||
#include "util.h"
|
||||
|
||||
/*
 * true iff l <= c <= u (inclusive on both ends); c appears twice in
 * the expansion, so do not pass an argument with side effects
 */
#define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u))
|
||||
|
||||
/* lookup-table for the types of sequence first bytes */
|
||||
static const struct {
	uint_least8_t lower;  /* lower bound of sequence first byte */
	uint_least8_t upper;  /* upper bound of sequence first byte */
	uint_least32_t mincp; /* smallest non-overlong encoded codepoint */
	uint_least32_t maxcp; /* largest encodable codepoint */
	/*
	 * implicit: table-offset represents the number of following
	 * bytes of the form 10xxxxxx (6 bits capacity each)
	 */
} lut[] = {
	[0] = {
		/* 0xxxxxxx */
		.lower = 0x00, /* 00000000 */
		.upper = 0x7F, /* 01111111 */
		.mincp = (uint_least32_t)0,
		.maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */
	},
	[1] = {
		/* 110xxxxx */
		.lower = 0xC0, /* 11000000 */
		.upper = 0xDF, /* 11011111 */
		.mincp = (uint_least32_t)1 << 7,
		.maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
	},
	[2] = {
		/* 1110xxxx */
		.lower = 0xE0, /* 11100000 */
		.upper = 0xEF, /* 11101111 */
		.mincp = (uint_least32_t)1 << 11,
		.maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
	},
	[3] = {
		/* 11110xxx */
		.lower = 0xF0, /* 11110000 */
		.upper = 0xF7, /* 11110111 */
		.mincp = (uint_least32_t)1 << 16,
		.maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
	},
};
|
||||
|
||||
size_t
|
||||
grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp)
|
||||
{
|
||||
size_t off, i;
|
||||
uint_least32_t tmp;
|
||||
|
||||
if (cp == NULL) {
|
||||
/*
|
||||
* instead of checking every time if cp is NULL within
|
||||
* the decoder, simply point it at a dummy variable here.
|
||||
*/
|
||||
cp = &tmp;
|
||||
}
|
||||
|
||||
if (str == NULL || len == 0) {
|
||||
/* a sequence must be at least 1 byte long */
|
||||
*cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* identify sequence type with the first byte */
|
||||
for (off = 0; off < LEN(lut); off++) {
|
||||
if (BETWEEN(((const unsigned char *)str)[0], lut[off].lower,
|
||||
lut[off].upper)) {
|
||||
/*
|
||||
* first byte is within the bounds; fill
|
||||
* p with the the first bits contained in
|
||||
* the first byte (by subtracting the high bits)
|
||||
*/
|
||||
*cp = ((const unsigned char *)str)[0] - lut[off].lower;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (off == LEN(lut)) {
|
||||
/*
|
||||
* first byte does not match a sequence type;
|
||||
* set cp as invalid and return 1 byte processed
|
||||
*
|
||||
* this also includes the cases where bits higher than
|
||||
* the 8th are set on systems with CHAR_BIT > 8
|
||||
*/
|
||||
*cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
return 1;
|
||||
}
|
||||
if (1 + off > len) {
|
||||
/*
|
||||
* input is not long enough, set cp as invalid
|
||||
*/
|
||||
*cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
|
||||
/*
|
||||
* count the following continuation bytes, but nothing
|
||||
* else in case we have a "rogue" case where e.g. such a
|
||||
* sequence starter occurs right before a NUL-byte.
|
||||
*/
|
||||
for (i = 0; 1 + i < len; i++) {
|
||||
if(!BETWEEN(((const unsigned char *)str)[1 + i],
|
||||
0x80, 0xBF)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* if the continuation bytes do not continue until
|
||||
* the end, return the incomplete sequence length.
|
||||
* Otherwise return the number of bytes we actually
|
||||
* expected, which is larger than n.
|
||||
*/
|
||||
return ((1 + i) < len) ? (1 + i) : (1 + off);
|
||||
}
|
||||
|
||||
/*
|
||||
* process 'off' following bytes, each of the form 10xxxxxx
|
||||
* (i.e. between 0x80 (10000000) and 0xBF (10111111))
|
||||
*/
|
||||
for (i = 1; i <= off; i++) {
|
||||
if(!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF)) {
|
||||
/*
|
||||
* byte does not match format; return
|
||||
* number of bytes processed excluding the
|
||||
* unexpected character as recommended since
|
||||
* Unicode 6 (chapter 3)
|
||||
*
|
||||
* this also includes the cases where bits
|
||||
* higher than the 8th are set on systems
|
||||
* with CHAR_BIT > 8
|
||||
*/
|
||||
*cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
return 1 + (i - 1);
|
||||
}
|
||||
/*
|
||||
* shift codepoint by 6 bits and add the 6 stored bits
|
||||
* in s[i] to it using the bitmask 0x3F (00111111)
|
||||
*/
|
||||
*cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x3F);
|
||||
}
|
||||
|
||||
if (*cp < lut[off].mincp ||
|
||||
BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
|
||||
*cp > UINT32_C(0x10FFFF)) {
|
||||
/*
|
||||
* codepoint is overlong encoded in the sequence, is a
|
||||
* high or low UTF-16 surrogate half (0xD800..0xDFFF) or
|
||||
* not representable in UTF-16 (>0x10FFFF) (RFC-3629
|
||||
* specifies the latter two conditions)
|
||||
*/
|
||||
*cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
}
|
||||
|
||||
return 1 + off;
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len)
|
||||
{
|
||||
size_t off, i;
|
||||
|
||||
if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
|
||||
cp > UINT32_C(0x10FFFF)) {
|
||||
/*
|
||||
* codepoint is a high or low UTF-16 surrogate half
|
||||
* (0xD800..0xDFFF) or not representable in UTF-16
|
||||
* (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
|
||||
*/
|
||||
cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
}
|
||||
|
||||
/* determine necessary sequence type */
|
||||
for (off = 0; off < LEN(lut); off++) {
|
||||
if (cp <= lut[off].maxcp) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (1 + off > len || str == NULL || len == 0) {
|
||||
/*
|
||||
* specified buffer is too small to store sequence or
|
||||
* the caller just wanted to know how many bytes the
|
||||
* codepoint needs by passing a NULL-buffer.
|
||||
*/
|
||||
return 1 + off;
|
||||
}
|
||||
|
||||
/* build sequence by filling cp-bits into each byte */
|
||||
|
||||
/*
|
||||
* lut[off].lower is the bit-format for the first byte and
|
||||
* the bits to fill into it are determined by shifting the
|
||||
* cp 6 times the number of following bytes, as each
|
||||
* following byte stores 6 bits, yielding the wanted bits.
|
||||
*
|
||||
* We do not overwrite the mask because we guaranteed earlier
|
||||
* that there are no bits higher than the mask allows.
|
||||
*/
|
||||
((unsigned char *)str)[0] = lut[off].lower |
|
||||
(uint_least8_t)(cp >> (6 * off));
|
||||
|
||||
for (i = 1; i <= off; i++) {
|
||||
/*
|
||||
* the bit-format for following bytes is 10000000 (0x80)
|
||||
* and it each stores 6 bits in the 6 low bits that we
|
||||
* extract from the properly-shifted value using the
|
||||
* mask 00111111 (0x3F)
|
||||
*/
|
||||
((unsigned char *)str)[i] = 0x80 |
|
||||
((cp >> (6 * (off - i))) & 0x3F);
|
||||
}
|
||||
|
||||
return 1 + off;
|
||||
}
|
BIN
libs/libgrapheme-2.0.2/src/utf8.o
Normal file
BIN
libs/libgrapheme-2.0.2/src/utf8.o
Normal file
Binary file not shown.
417
libs/libgrapheme-2.0.2/src/util.c
Normal file
417
libs/libgrapheme-2.0.2/src/util.c
Normal file
@@ -0,0 +1,417 @@
|
||||
/* See LICENSE file for copyright and license details. */
|
||||
#include <limits.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "../gen/types.h"
|
||||
#include "../grapheme.h"
|
||||
#include "util.h"
|
||||
|
||||
void
|
||||
herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
|
||||
const void *src, size_t srclen)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
r->type = type;
|
||||
r->src = src;
|
||||
r->srclen = srclen;
|
||||
r->off = 0;
|
||||
r->terminated_by_null = false;
|
||||
|
||||
for (i = 0; i < LEN(r->soft_limit); i++) {
|
||||
r->soft_limit[i] = SIZE_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
/*
|
||||
* we copy such that we have a "fresh" start and build on the
|
||||
* fact that src->soft_limit[i] for any i and src->srclen are
|
||||
* always larger or equal to src->off
|
||||
*/
|
||||
dest->type = src->type;
|
||||
if (src->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
dest->src = (src->src == NULL) ? NULL :
|
||||
((const uint_least32_t *)(src->src)) + src->off;
|
||||
} else { /* src->type == HERODOTUS_TYPE_UTF8 */
|
||||
dest->src = (src->src == NULL) ? NULL :
|
||||
((const char *)(src->src)) + src->off;
|
||||
}
|
||||
if (src->srclen == SIZE_MAX) {
|
||||
dest->srclen = SIZE_MAX;
|
||||
} else {
|
||||
dest->srclen = (src->off < src->srclen) ? src->srclen - src->off : 0;
|
||||
}
|
||||
dest->off = 0;
|
||||
dest->terminated_by_null = src->terminated_by_null;
|
||||
|
||||
for (i = 0; i < LEN(src->soft_limit); i++) {
|
||||
if (src->soft_limit[i] == SIZE_MAX) {
|
||||
dest->soft_limit[i] = SIZE_MAX;
|
||||
} else {
|
||||
/*
|
||||
* if we have a degenerate case where the offset is
|
||||
* higher than the soft-limit, we simply clamp the
|
||||
* soft-limit to zero given we can't decide here
|
||||
* to release the limit and, instead, we just
|
||||
* prevent any more reads
|
||||
*/
|
||||
dest->soft_limit[i] = (src->off < src->soft_limit[i]) ?
|
||||
src->soft_limit[i] - src->off : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
|
||||
r->soft_limit[i] = r->soft_limit[i - 1];
|
||||
}
|
||||
r->soft_limit[0] = r->off + count;
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_reader_pop_limit(HERODOTUS_READER *r)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
|
||||
r->soft_limit[i] = r->soft_limit[i + 1];
|
||||
}
|
||||
r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
|
||||
}
|
||||
|
||||
size_t
|
||||
herodotus_reader_next_word_break(const HERODOTUS_READER *r)
|
||||
{
|
||||
if (r->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
return grapheme_next_word_break(
|
||||
(const uint_least32_t *)(r->src) + r->off,
|
||||
MIN(r->srclen, r->soft_limit[0]) - r->off);
|
||||
} else { /* r->type == HERODOTUS_TYPE_UTF8 */
|
||||
return grapheme_next_word_break_utf8(
|
||||
(const char *)(r->src) + r->off,
|
||||
MIN(r->srclen, r->soft_limit[0]) - r->off);
|
||||
}
|
||||
}
|
||||
|
||||
size_t
|
||||
herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
|
||||
{
|
||||
if (r->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
|
||||
} else { /* r->type == HERODOTUS_TYPE_UTF8 */
|
||||
return grapheme_decode_utf8(
|
||||
(const char *)(r->src) + r->off,
|
||||
MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
size_t
|
||||
herodotus_reader_number_read(const HERODOTUS_READER *r)
|
||||
{
|
||||
return r->off;
|
||||
}
|
||||
|
||||
enum herodotus_status
|
||||
herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
|
||||
{
|
||||
size_t ret;
|
||||
|
||||
if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
|
||||
*cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
return HERODOTUS_STATUS_END_OF_BUFFER;
|
||||
}
|
||||
|
||||
if (r->off >= r->soft_limit[0]) {
|
||||
*cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
|
||||
}
|
||||
|
||||
if (r->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
*cp = ((const uint_least32_t *)(r->src))[r->off];
|
||||
ret = 1;
|
||||
} else { /* r->type == HERODOTUS_TYPE_UTF8 */
|
||||
ret = grapheme_decode_utf8((const char *)r->src + r->off,
|
||||
MIN(r->srclen, r->soft_limit[0]) -
|
||||
r->off, cp);
|
||||
}
|
||||
|
||||
if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
|
||||
/*
|
||||
* We encountered a null-codepoint. Don't increment
|
||||
* offset and return as if the buffer had ended here all
|
||||
* along
|
||||
*/
|
||||
r->terminated_by_null = true;
|
||||
return HERODOTUS_STATUS_END_OF_BUFFER;
|
||||
}
|
||||
|
||||
if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
|
||||
/*
|
||||
* we want more than we have; instead of returning
|
||||
* garbage we terminate here.
|
||||
*/
|
||||
return HERODOTUS_STATUS_END_OF_BUFFER;
|
||||
}
|
||||
|
||||
/*
|
||||
* Increase offset which we now know won't surpass the limits,
|
||||
* unless we got told otherwise
|
||||
*/
|
||||
if (advance) {
|
||||
r->off += ret;
|
||||
}
|
||||
|
||||
return HERODOTUS_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type,
|
||||
void *dest, size_t destlen)
|
||||
{
|
||||
w->type = type;
|
||||
w->dest = dest;
|
||||
w->destlen = destlen;
|
||||
w->off = 0;
|
||||
w->first_unwritable_offset = SIZE_MAX;
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
|
||||
{
|
||||
if (w->dest == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (w->off < w->destlen) {
|
||||
/* We still have space in the buffer. Simply use it */
|
||||
if (w->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
((uint_least32_t *)(w->dest))[w->off] = 0;
|
||||
} else { /* w->type == HERODOTUS_TYPE_UTF8 */
|
||||
((char *)(w->dest))[w->off] = '\0';
|
||||
}
|
||||
} else if (w->first_unwritable_offset < w->destlen) {
|
||||
/*
|
||||
* There is no more space in the buffer. However,
|
||||
* we have noted down the first offset we couldn't
|
||||
* use to write into the buffer and it's smaller than
|
||||
* destlen. Thus we bailed writing into the
|
||||
* destination when a multibyte-codepoint couldn't be
|
||||
* written. So the last "real" byte might be at
|
||||
* destlen-4, destlen-3, destlen-2 or destlen-1
|
||||
* (the last case meaning truncation).
|
||||
*/
|
||||
if (w->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
((uint_least32_t *)(w->dest))
|
||||
[w->first_unwritable_offset] = 0;
|
||||
} else { /* w->type == HERODOTUS_TYPE_UTF8 */
|
||||
((char *)(w->dest))[w->first_unwritable_offset] = '\0';
|
||||
}
|
||||
} else if (w->destlen > 0) {
|
||||
/*
|
||||
* In this case, there is no more space in the buffer and
|
||||
* the last unwritable offset is larger than
|
||||
* or equal to the destination buffer length. This means
|
||||
* that we are forced to simply write into the last
|
||||
* byte.
|
||||
*/
|
||||
if (w->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
((uint_least32_t *)(w->dest))
|
||||
[w->destlen - 1] = 0;
|
||||
} else { /* w->type == HERODOTUS_TYPE_UTF8 */
|
||||
((char *)(w->dest))[w->destlen - 1] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
/* w->off is not incremented in any case */
|
||||
}
|
||||
|
||||
size_t
|
||||
herodotus_writer_number_written(const HERODOTUS_WRITER *w)
|
||||
{
|
||||
return w->off;
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
|
||||
{
|
||||
size_t ret;
|
||||
|
||||
/*
|
||||
* This function will always faithfully say how many codepoints
|
||||
* were written, even if the buffer ends. This is used to enable
|
||||
* truncation detection.
|
||||
*/
|
||||
if (w->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
if (w->dest != NULL && w->off < w->destlen) {
|
||||
((uint_least32_t *)(w->dest))[w->off] = cp;
|
||||
}
|
||||
|
||||
w->off += 1;
|
||||
} else { /* w->type == HERODOTUS_TYPE_UTF8 */
|
||||
/*
|
||||
* First determine how many bytes we need to encode the
|
||||
* codepoint
|
||||
*/
|
||||
ret = grapheme_encode_utf8(cp, NULL, 0);
|
||||
|
||||
if (w->dest != NULL && w->off + ret < w->destlen) {
|
||||
/* we still have enough room in the buffer */
|
||||
grapheme_encode_utf8(cp, (char *)(w->dest) +
|
||||
w->off, w->destlen - w->off);
|
||||
} else if (w->first_unwritable_offset == SIZE_MAX) {
|
||||
/*
|
||||
* the first unwritable offset has not been
|
||||
* noted down, so this is the first time we can't
|
||||
* write (completely) to an offset
|
||||
*/
|
||||
w->first_unwritable_offset = w->off;
|
||||
}
|
||||
|
||||
w->off += ret;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
|
||||
uint_least8_t (*get_break_prop)(uint_least32_t),
|
||||
bool (*is_skippable_prop)(uint_least8_t),
|
||||
void (*skip_shift_callback)(uint_least8_t, void *),
|
||||
struct proper *p)
|
||||
{
|
||||
uint_least8_t prop;
|
||||
uint_least32_t cp;
|
||||
size_t i;
|
||||
|
||||
/* set internal variables */
|
||||
p->state = state;
|
||||
p->no_prop = no_prop;
|
||||
p->get_break_prop = get_break_prop;
|
||||
p->is_skippable_prop = is_skippable_prop;
|
||||
p->skip_shift_callback = skip_shift_callback;
|
||||
|
||||
/*
|
||||
* Initialize mid-reader, which is basically just there
|
||||
* to reflect the current position of the viewing-line
|
||||
*/
|
||||
herodotus_reader_copy(r, &(p->mid_reader));
|
||||
|
||||
/*
|
||||
* In the initialization, we simply (try to) fill in next_prop.
|
||||
* If we cannot read in more (due to the buffer ending), we
|
||||
* fill in the prop as invalid
|
||||
*/
|
||||
|
||||
/*
|
||||
* initialize the previous properties to have no property
|
||||
* (given we are at the start of the buffer)
|
||||
*/
|
||||
p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
|
||||
p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
|
||||
|
||||
/*
|
||||
* initialize the next properties
|
||||
*/
|
||||
|
||||
/* initialize the raw reader */
|
||||
herodotus_reader_copy(r, &(p->raw_reader));
|
||||
|
||||
/* fill in the two next raw properties (after no-initialization) */
|
||||
p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
|
||||
for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS; ) {
|
||||
p->raw.next_prop[i++] = p->get_break_prop(cp);
|
||||
}
|
||||
|
||||
/* initialize the skip reader */
|
||||
herodotus_reader_copy(r, &(p->skip_reader));
|
||||
|
||||
/* fill in the two next skip properties (after no-initialization) */
|
||||
p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
|
||||
for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS; ) {
|
||||
prop = p->get_break_prop(cp);
|
||||
if (!p->is_skippable_prop(prop)) {
|
||||
p->skip.next_prop[i++] = prop;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
proper_advance(struct proper *p)
|
||||
{
|
||||
uint_least8_t prop;
|
||||
uint_least32_t cp;
|
||||
|
||||
/* read in next "raw" property */
|
||||
if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS) {
|
||||
prop = p->get_break_prop(cp);
|
||||
} else {
|
||||
prop = p->no_prop;
|
||||
}
|
||||
|
||||
/*
|
||||
* do a shift-in, unless we find that the property that is to
|
||||
* be moved past the "raw-viewing-line" (this property is stored
|
||||
* in p->raw.next_prop[0]) is a no_prop, indicating that
|
||||
* we are at the end of the buffer.
|
||||
*/
|
||||
if (p->raw.next_prop[0] == p->no_prop) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* shift in the properties */
|
||||
p->raw.prev_prop[1] = p->raw.prev_prop[0];
|
||||
p->raw.prev_prop[0] = p->raw.next_prop[0];
|
||||
p->raw.next_prop[0] = p->raw.next_prop[1];
|
||||
p->raw.next_prop[1] = prop;
|
||||
|
||||
/* advance the middle reader viewing-line */
|
||||
(void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
|
||||
|
||||
/* check skippability-property */
|
||||
if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
|
||||
/*
|
||||
* the property that has moved past the "raw-viewing-line"
|
||||
* (this property is now (after the raw-shift) stored in
|
||||
* p->raw.prev_prop[0] and guaranteed not to be a no-prop,
|
||||
* guaranteeing that we won't shift a no-prop past the
|
||||
* "viewing-line" in the skip-properties) is not a skippable
|
||||
* property, thus we need to shift the skip property as well.
|
||||
*/
|
||||
p->skip.prev_prop[1] = p->skip.prev_prop[0];
|
||||
p->skip.prev_prop[0] = p->skip.next_prop[0];
|
||||
p->skip.next_prop[0] = p->skip.next_prop[1];
|
||||
|
||||
/*
|
||||
* call the skip-shift-callback on the property that
|
||||
* passed the skip-viewing-line (this property is now
|
||||
* stored in p->skip.prev_prop[0]).
|
||||
*/
|
||||
p->skip_shift_callback(p->skip.prev_prop[0], p->state);
|
||||
|
||||
/* determine the next shift property */
|
||||
p->skip.next_prop[1] = p->no_prop;
|
||||
while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS) {
|
||||
prop = p->get_break_prop(cp);
|
||||
if (!p->is_skippable_prop(prop)) {
|
||||
p->skip.next_prop[1] = prop;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
116
libs/libgrapheme-2.0.2/src/util.h
Normal file
116
libs/libgrapheme-2.0.2/src/util.h
Normal file
@@ -0,0 +1,116 @@
|
||||
/* See LICENSE file for copyright and license details. */
|
||||
#ifndef UTIL_H
|
||||
#define UTIL_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "../gen/types.h"
|
||||
#include "../grapheme.h"
|
||||
|
||||
/* smaller of two values; note that each argument may be evaluated twice */
#undef MIN
#define MIN(x,y) ((x) < (y) ? (x) : (y))

/* number of elements of an array (must not be used on a pointer) */
#undef LEN
#define LEN(x) (sizeof(x) / sizeof((x)[0]))

/*
 * Branch hints. They default to the plain expression and are only
 * replaced by __builtin_expect() if the compiler offers __has_builtin
 * and reports that builtin as available.
 */
#undef likely
#undef unlikely
#define likely(expr) (expr)
#define unlikely(expr) (expr)
#ifdef __has_builtin
	#if __has_builtin(__builtin_expect)
		#undef likely
		#undef unlikely
		#define likely(expr) __builtin_expect(!!(expr), 1)
		#define unlikely(expr) __builtin_expect(!!(expr), 0)
	#endif
#endif
|
||||
|
||||
/*
|
||||
* Herodotus, the ancient Greek historian and geographer,
|
||||
* was criticized for including legends and other fantastic
|
||||
* accounts into his works, among others by his contemporary
|
||||
* Thucydides.
|
||||
*
|
||||
* The Herodotus readers and writers are tailored towards the needs
|
||||
* of the library interface, doing all the dirty work behind the
|
||||
* scenes. While the reader is relatively faithful in his accounts,
|
||||
* the Herodotus writer will never fail and always claim to write the
|
||||
* data. Internally, it only writes as much as it can, and will simply
|
||||
* keep account of the rest. This way, we can properly signal truncation.
|
||||
*
|
||||
* In this sense, explaining the naming, the writer is always a bit
|
||||
* inaccurate in his accounts.
|
||||
*
|
||||
*/
|
||||
/* result of an attempt to read a codepoint from a Herodotus reader */
enum herodotus_status {
	/* a codepoint was read */
	HERODOTUS_STATUS_SUCCESS = 0,
	/* the source is exhausted (or was never there) */
	HERODOTUS_STATUS_END_OF_BUFFER = 1,
	/* more data exists, but the active soft limit has been reached */
	HERODOTUS_STATUS_SOFT_LIMIT_REACHED = 2,
};
|
||||
|
||||
/* kind of data a Herodotus reader or writer operates on */
enum herodotus_type {
	/* array of uint_least32_t codepoints; offsets count codepoints */
	HERODOTUS_TYPE_CODEPOINT = 0,
	/* UTF-8 encoded char array; offsets count bytes */
	HERODOTUS_TYPE_UTF8 = 1,
};
|
||||
|
||||
typedef struct herodotus_reader {
|
||||
enum herodotus_type type;
|
||||
const void *src;
|
||||
size_t srclen;
|
||||
size_t off;
|
||||
bool terminated_by_null;
|
||||
size_t soft_limit[10];
|
||||
} HERODOTUS_READER;
|
||||
|
||||
typedef struct herodotus_writer {
|
||||
enum herodotus_type type;
|
||||
void *dest;
|
||||
size_t destlen;
|
||||
size_t off;
|
||||
size_t first_unwritable_offset;
|
||||
} HERODOTUS_WRITER;
|
||||
|
||||
struct proper {
|
||||
/*
|
||||
* prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1]
|
||||
*/
|
||||
struct {
|
||||
uint_least8_t prev_prop[2];
|
||||
uint_least8_t next_prop[2];
|
||||
} raw, skip;
|
||||
HERODOTUS_READER mid_reader, raw_reader, skip_reader;
|
||||
void *state;
|
||||
uint_least8_t no_prop;
|
||||
uint_least8_t (*get_break_prop)(uint_least32_t);
|
||||
bool (*is_skippable_prop)(uint_least8_t);
|
||||
void (*skip_shift_callback)(uint_least8_t, void *);
|
||||
};
|
||||
|
||||
void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
|
||||
const void *, size_t);
|
||||
void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *);
|
||||
void herodotus_reader_push_advance_limit(HERODOTUS_READER *, size_t);
|
||||
void herodotus_reader_pop_limit(HERODOTUS_READER *);
|
||||
size_t herodotus_reader_number_read(const HERODOTUS_READER *);
|
||||
size_t herodotus_reader_next_word_break(const HERODOTUS_READER *);
|
||||
size_t herodotus_reader_next_codepoint_break(const HERODOTUS_READER *);
|
||||
enum herodotus_status herodotus_read_codepoint(HERODOTUS_READER *, bool, uint_least32_t *);
|
||||
|
||||
void herodotus_writer_init(HERODOTUS_WRITER *, enum herodotus_type, void *,
|
||||
size_t);
|
||||
void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
|
||||
size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
|
||||
void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);
|
||||
|
||||
void proper_init(const HERODOTUS_READER *, void *, uint_least8_t,
|
||||
uint_least8_t (*get_break_prop)(uint_least32_t),
|
||||
bool (*is_skippable_prop)(uint_least8_t),
|
||||
void (*skip_shift_callback)(uint_least8_t, void *),
|
||||
struct proper *);
|
||||
int proper_advance(struct proper *);
|
||||
|
||||
#endif /* UTIL_H */
|
BIN
libs/libgrapheme-2.0.2/src/util.o
Normal file
BIN
libs/libgrapheme-2.0.2/src/util.o
Normal file
Binary file not shown.
268
libs/libgrapheme-2.0.2/src/word.c
Normal file
268
libs/libgrapheme-2.0.2/src/word.c
Normal file
@@ -0,0 +1,268 @@
|
||||
/* See LICENSE file for copyright and license details. */
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include "../gen/word.h"
|
||||
#include "../grapheme.h"
|
||||
#include "util.h"
|
||||
|
||||
/* state the word-break algorithm carries along while scanning */
struct word_break_state {
	/*
	 * true while the number of consecutive regional indicators on
	 * the left side of the breakpoint is even (zero included)
	 */
	bool ri_even;
};
|
||||
|
||||
static inline uint_least8_t
|
||||
get_word_break_prop(uint_least32_t cp)
|
||||
{
|
||||
if (likely(cp <= 0x10FFFF)) {
|
||||
return (uint_least8_t)
|
||||
word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
|
||||
} else {
|
||||
return WORD_BREAK_PROP_OTHER;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
is_skippable_word_prop(uint_least8_t prop)
|
||||
{
|
||||
return prop == WORD_BREAK_PROP_EXTEND ||
|
||||
prop == WORD_BREAK_PROP_FORMAT ||
|
||||
prop == WORD_BREAK_PROP_ZWJ;
|
||||
}
|
||||
|
||||
static void
|
||||
word_skip_shift_callback(uint_least8_t prop, void *s)
|
||||
{
|
||||
struct word_break_state *state = (struct word_break_state *)s;
|
||||
|
||||
if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
|
||||
/*
|
||||
* The property we just shifted in is
|
||||
* a regional indicator, increasing the
|
||||
* number of consecutive RIs on the left
|
||||
* side of the breakpoint by one, changing
|
||||
* the oddness.
|
||||
*
|
||||
*/
|
||||
state->ri_even = !(state->ri_even);
|
||||
} else {
|
||||
/*
|
||||
* We saw no regional indicator, so the
|
||||
* number of consecutive RIs on the left
|
||||
* side of the breakpoint is zero, which
|
||||
* is an even number.
|
||||
*
|
||||
*/
|
||||
state->ri_even = true;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t
|
||||
next_word_break(HERODOTUS_READER *r)
|
||||
{
|
||||
struct proper p;
|
||||
struct word_break_state state = { .ri_even = true };
|
||||
|
||||
/*
|
||||
* Apply word breaking algorithm (UAX #29), see
|
||||
* https://unicode.org/reports/tr29/#Word_Boundary_Rules
|
||||
*/
|
||||
proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
|
||||
is_skippable_word_prop, word_skip_shift_callback, &p);
|
||||
|
||||
while (!proper_advance(&p)) {
|
||||
/* WB3 */
|
||||
if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
|
||||
p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB3a */
|
||||
if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
|
||||
p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
|
||||
p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* WB3b */
|
||||
if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
|
||||
p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
|
||||
p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* WB3c */
|
||||
if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
|
||||
(p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
|
||||
p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB3d */
|
||||
if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
|
||||
p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB4 */
|
||||
if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
|
||||
p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
|
||||
p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB5 */
|
||||
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
|
||||
(p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB6 */
|
||||
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
|
||||
(p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
|
||||
(p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB7 */
|
||||
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
|
||||
(p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
|
||||
(p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB7a */
|
||||
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB7b */
|
||||
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
|
||||
p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB7c */
|
||||
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
|
||||
p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB8 */
|
||||
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB9 */
|
||||
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB10 */
|
||||
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
|
||||
(p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB11 */
|
||||
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
|
||||
p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB12 */
|
||||
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
|
||||
(p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
|
||||
p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB13 */
|
||||
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB13a */
|
||||
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
|
||||
p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB13b */
|
||||
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
|
||||
(p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB15 and WB16 */
|
||||
if (!state.ri_even &&
|
||||
p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* WB999 */
|
||||
break;
|
||||
}
|
||||
|
||||
return herodotus_reader_number_read(&(p.mid_reader));
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_next_word_break(const uint_least32_t *str, size_t len)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
|
||||
|
||||
return next_word_break(&r);
|
||||
}
|
||||
|
||||
size_t
|
||||
grapheme_next_word_break_utf8(const char *str, size_t len)
|
||||
{
|
||||
HERODOTUS_READER r;
|
||||
|
||||
herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
|
||||
|
||||
return next_word_break(&r);
|
||||
}
|
BIN
libs/libgrapheme-2.0.2/src/word.o
Normal file
BIN
libs/libgrapheme-2.0.2/src/word.o
Normal file
Binary file not shown.
Reference in New Issue
Block a user