Initial Commit

This commit is contained in:
2025-08-30 16:07:19 +01:00
commit d86c15e30c
169 changed files with 121377 additions and 0 deletions

View File

@@ -0,0 +1,470 @@
/* See LICENSE file for copyright and license details. */
#include <stddef.h>
#include <stdint.h>
#include "../grapheme.h"
#include "../gen/case.h"
#include "util.h"
/*
 * Look up the case property of the codepoint cp in the two-stage
 * table case_major/case_minor. Codepoints above U+10FFFF yield
 * CASE_PROP_OTHER.
 */
static inline enum case_property
get_case_property(uint_least32_t cp)
{
	if (likely(cp <= UINT32_C(0x10FFFF))) {
		/* upper bits select the block, the low 8 bits the entry in it */
		const uint_least32_t block = cp >> 8;
		const uint_least32_t entry = cp & 0xFF;

		return (enum case_property)case_minor[case_major[block] + entry];
	}

	return CASE_PROP_OTHER;
}
/*
 * Look up the case-mapping value of the codepoint cp in the two-stage
 * table major/minor. Codepoints above U+10FFFF yield 0.
 *
 * The returned value might be larger than or equal to 0x110000, which
 * marks a special-case-mapping; callers have to handle that separately.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	if (likely(cp <= UINT32_C(0x10FFFF))) {
		/* upper bits select the block, the low 8 bits the entry in it */
		const uint_least32_t block = cp >> 8;
		const uint_least32_t entry = cp & 0xFF;

		return minor[major[block] + entry];
	}

	return 0;
}
/*
 * Read all codepoints from r, apply the case mapping given by the
 * tables major/minor (simple offsets) and sc (special, multi-codepoint
 * mappings) and write the result to w. The output is NUL-terminated via
 * herodotus_writer_nul_terminate() and the function returns
 * herodotus_writer_number_written(w).
 *
 * final_sigma_level is the initial state of the Final_Sigma
 * before-condition tracker and is only used when sc == lower_special:
 *
 *    0: no cased letter precedes the current position
 *    1: a cased letter directly precedes the current position
 *    2: a cased letter followed by one or more case-ignorable
 *       characters precedes the current position
 */
static inline size_t
to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
        uint_least8_t final_sigma_level, const uint_least16_t *major,
        const int_least32_t *minor, const struct special_case *sc)
{
	HERODOTUS_READER tmp;
	enum case_property prop;
	enum herodotus_status s;
	size_t off, i;
	uint_least32_t cp, tmp_cp;
	int_least32_t map;

	for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
		if (sc == lower_special) {
			/*
			 * For the special Final_Sigma-rule (see SpecialCasing.txt),
			 * which is the only non-localized case-dependent rule,
			 * we apply a different mapping when a sigma is at the
			 * end of a word.
			 *
			 * Before: cased case-ignorable*
			 * After:  not(case-ignorable* cased)
			 *
			 * We check the after-condition on demand, but the before-
			 * condition is best checked using the "level"-heuristic
			 * also used in the sentence and line breaking-implementations.
			 */
			if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER SIGMA */
			    (final_sigma_level == 1 ||
			     final_sigma_level == 2)) {
				/*
				 * check succeeding characters by first skipping
				 * all case-ignorable characters and then checking
				 * if the succeeding character is cased, invalidating
				 * the after-condition
				 */
				herodotus_reader_copy(r, &tmp);
				for (prop = NUM_CASE_PROPS;
				     (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
				     HERODOTUS_STATUS_SUCCESS; ) {
					prop = get_case_property(tmp_cp);

					if (prop != CASE_PROP_CASE_IGNORABLE &&
					    prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
						break;
					}
				}

				/*
				 * Now prop is something other than case-ignorable or
				 * the source-string ended.
				 * If it is something other than cased, we know
				 * that the after-condition holds
				 */
				if (s != HERODOTUS_STATUS_SUCCESS ||
				    (prop != CASE_PROP_CASED &&
				     prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
					/*
					 * write GREEK SMALL LETTER FINAL SIGMA to
					 * destination
					 */
					herodotus_write_codepoint(w, UINT32_C(0x03C2));

					/* reset Final_Sigma-state and continue */
					final_sigma_level = 0;
					continue;
				}
			}

			/*
			 * update state
			 *
			 * A cased character satisfies the before-condition
			 * for the following codepoint no matter what came
			 * before it, so it always (re)starts the sequence.
			 * In particular this holds at level 2 (cased
			 * case-ignorable+ cased), which must not be treated
			 * as a broken sequence.
			 */
			prop = get_case_property(cp);
			if (prop == CASE_PROP_CASED ||
			    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
				/* sequence has begun (anew) */
				final_sigma_level = 1;
			} else if (prop == CASE_PROP_CASE_IGNORABLE &&
			           (final_sigma_level == 1 ||
			            final_sigma_level == 2)) {
				/* case-ignorable sequence begins or continued */
				final_sigma_level = 2;
			} else {
				/* sequence broke */
				final_sigma_level = 0;
			}
		}

		/* get and handle case mapping */
		if (unlikely((map = get_case_offset(cp, major, minor)) >=
		             INT32_C(0x110000))) {
			/*
			 * we have a special case and the offset in the sc-array
			 * is the difference to 0x110000
			 */
			off = (uint_least32_t)map - UINT32_C(0x110000);

			for (i = 0; i < sc[off].cplen; i++) {
				herodotus_write_codepoint(w, sc[off].cp[i]);
			}
		} else {
			/* we have a simple mapping */
			herodotus_write_codepoint(w, (uint_least32_t)
			                          ((int_least32_t)cp + map));
		}
	}

	herodotus_writer_nul_terminate(w);

	return herodotus_writer_number_written(w);
}
/*
 * Determine the next word break for the data behind the reader r
 * without modifying r itself: the reader is copied and the copy's
 * src/srclen are handed to the word-break function matching the
 * reader's type (codepoint array or UTF-8).
 */
static size_t
herodotus_next_word_break(const HERODOTUS_READER *r)
{
	HERODOTUS_READER view;

	herodotus_reader_copy(r, &view);

	if (r->type != HERODOTUS_TYPE_CODEPOINT) {
		/* r->type == HERODOTUS_TYPE_UTF8 */
		return grapheme_next_word_break_utf8(view.src, view.srclen);
	}

	return grapheme_next_word_break(view.src, view.srclen);
}
/*
 * Convert the data behind r to titlecase and write it to w, word by
 * word: uncased codepoints leading a word are copied verbatim, the
 * first cased codepoint is mapped with the title-tables and the rest
 * of the word with the lower-tables. The output is NUL-terminated and
 * herodotus_writer_number_written(w) is returned.
 */
static inline size_t
to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
{
	enum case_property prop;
	enum herodotus_status status;
	uint_least32_t cp;
	size_t word_len;

	while ((word_len = herodotus_next_word_break(r)) > 0) {
		/* restrict the reader to the current word */
		herodotus_reader_push_advance_limit(r, word_len);

		/* copy everything up to the first cased character */
		for (;;) {
			status = herodotus_read_codepoint(r, false, &cp);
			if (status != HERODOTUS_STATUS_SUCCESS) {
				break;
			}

			prop = get_case_property(cp);
			if (prop == CASE_PROP_CASED ||
			    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
				/* found a cased character */
				break;
			}

			/* write the data to the output verbatim, if it permits */
			herodotus_write_codepoint(w, cp);

			/* increment reader */
			herodotus_read_codepoint(r, true, &cp);
		}

		if (status == HERODOTUS_STATUS_END_OF_BUFFER) {
			/* we are done */
			herodotus_reader_pop_limit(r);
			break;
		}
		if (status == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
			/*
			 * we did not encounter any cased character
			 * up to the word break
			 */
			herodotus_reader_pop_limit(r);
			continue;
		}

		/*
		 * we encountered a cased character before the word
		 * break, convert it (and only it) to titlecase
		 */
		herodotus_reader_push_advance_limit(r,
			herodotus_reader_next_codepoint_break(r));
		to_case(r, w, 0, title_major, title_minor, title_special);
		herodotus_reader_pop_limit(r);

		/* cast the rest of the codepoints in the word to lowercase */
		to_case(r, w, 1, lower_major, lower_minor, lower_special);

		/* remove the limit on the word before the next iteration */
		herodotus_reader_pop_limit(r);
	}

	herodotus_writer_nul_terminate(w);

	return herodotus_writer_number_written(w);
}
/*
 * Uppercase the codepoint array src (srclen elements) into dest
 * (destlen elements) using the upper-tables. Returns the result of
 * to_case().
 */
size_t
grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_CODEPOINT, dest, destlen);

	return to_case(&reader, &writer, 0, upper_major, upper_minor,
	               upper_special);
}
/*
 * Lowercase the codepoint array src (srclen elements) into dest
 * (destlen elements) using the lower-tables, starting with a
 * Final_Sigma-level of 0. Returns the result of to_case().
 */
size_t
grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_CODEPOINT, dest, destlen);

	return to_case(&reader, &writer, 0, lower_major, lower_minor,
	               lower_special);
}
/*
 * Titlecase the codepoint array src (srclen elements) into dest
 * (destlen elements). Returns the result of to_titlecase().
 */
size_t
grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_CODEPOINT, dest, destlen);

	return to_titlecase(&reader, &writer);
}
/*
 * Uppercase the UTF-8 data src (srclen bytes) into dest (destlen
 * bytes) using the upper-tables. Returns the result of to_case().
 */
size_t
grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_UTF8, dest, destlen);

	return to_case(&reader, &writer, 0, upper_major, upper_minor,
	               upper_special);
}
/*
 * Lowercase the UTF-8 data src (srclen bytes) into dest (destlen
 * bytes) using the lower-tables, starting with a Final_Sigma-level
 * of 0. Returns the result of to_case().
 */
size_t
grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_UTF8, dest, destlen);

	return to_case(&reader, &writer, 0, lower_major, lower_minor,
	               lower_special);
}
/*
 * Titlecase the UTF-8 data src (srclen bytes) into dest (destlen
 * bytes). Returns the result of to_titlecase().
 */
size_t
grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_UTF8, dest, destlen);

	return to_titlecase(&reader, &writer);
}
/*
 * Check whether the data behind r is already in the case described by
 * the tables major/minor (simple offsets) and sc (special mappings),
 * i.e. whether applying the mapping would leave it unchanged.
 *
 * The reader is advanced over every codepoint that passed the check
 * and stops at the first one that did not. If output is non-NULL it
 * receives herodotus_reader_number_read(r) at that point.
 *
 * Returns false as soon as a difference is found, true otherwise.
 */
static inline bool
is_case(HERODOTUS_READER *r, const uint_least16_t *major,
        const int_least32_t *minor, const struct special_case *sc,
        size_t *output)
{
	bool matches = true;
	uint_least32_t cp;
	int_least32_t delta;
	size_t idx, k;

	/* peek at the next codepoint; we only advance once it is accepted */
	while (herodotus_read_codepoint(r, false, &cp) ==
	       HERODOTUS_STATUS_SUCCESS) {
		delta = get_case_offset(cp, major, minor);

		if (unlikely(delta >= INT32_C(0x110000))) {
			/*
			 * we have a special case and the offset in the
			 * sc-array is the difference to 0x110000; the input
			 * has to match the mapped sequence codepoint by
			 * codepoint
			 */
			idx = (uint_least32_t)delta - UINT32_C(0x110000);

			for (k = 0; k < sc[idx].cplen; k++) {
				if (herodotus_read_codepoint(r, false, &cp) !=
				    HERODOTUS_STATUS_SUCCESS) {
					/*
					 * input ended and we didn't see
					 * any difference so far, so this
					 * string is in fact okay
					 */
					matches = true;
					goto finish;
				}
				if (cp != sc[idx].cp[k]) {
					matches = false;
					goto finish;
				}

				/* move forward */
				herodotus_read_codepoint(r, true, &cp);
			}
			continue;
		}

		/* we have a simple mapping */
		if (cp != (uint_least32_t)((int_least32_t)cp + delta)) {
			/* we have a difference */
			matches = false;
			goto finish;
		}

		/* move forward */
		herodotus_read_codepoint(r, true, &cp);
	}

finish:
	if (output) {
		*output = herodotus_reader_number_read(r);
	}

	return matches;
}
/*
 * Check whether the data behind r is in titlecase: in every word the
 * first cased codepoint has to pass is_case() with the title-tables
 * and the remainder of the word has to pass is_case() with the
 * lower-tables.
 *
 * If output is non-NULL it receives herodotus_reader_number_read(r)
 * at the point the check stopped.
 */
static inline bool
is_titlecase(HERODOTUS_READER *r, size_t *output)
{
	enum case_property prop;
	enum herodotus_status status;
	bool ok = true;
	uint_least32_t cp;
	size_t word_len;

	while ((word_len = herodotus_next_word_break(r)) > 0) {
		/* restrict the reader to the current word */
		herodotus_reader_push_advance_limit(r, word_len);

		/* skip everything up to the first cased character */
		for (;;) {
			status = herodotus_read_codepoint(r, false, &cp);
			if (status != HERODOTUS_STATUS_SUCCESS) {
				break;
			}

			prop = get_case_property(cp);
			if (prop == CASE_PROP_CASED ||
			    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
				/* found a cased character */
				break;
			}

			/* increment reader */
			herodotus_read_codepoint(r, true, &cp);
		}

		if (status == HERODOTUS_STATUS_END_OF_BUFFER) {
			/* we are done */
			break;
		}
		if (status == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
			/*
			 * we did not encounter any cased character
			 * up to the word break
			 */
			herodotus_reader_pop_limit(r);
			continue;
		}

		/*
		 * we encountered a cased character before the word
		 * break, check if it (and only it) is titlecase
		 */
		herodotus_reader_push_advance_limit(r,
			herodotus_reader_next_codepoint_break(r));
		if (!is_case(r, title_major, title_minor, title_special, NULL)) {
			ok = false;
			break;
		}
		herodotus_reader_pop_limit(r);

		/* check if the rest of the codepoints in the word are lowercase */
		if (!is_case(r, lower_major, lower_minor, lower_special, NULL)) {
			ok = false;
			break;
		}

		/* remove the limit on the word before the next iteration */
		herodotus_reader_pop_limit(r);
	}

	if (output) {
		*output = herodotus_reader_number_read(r);
	}

	return ok;
}
/*
 * Check the codepoint array src (srclen elements) against the
 * upper-tables. Returns the result of is_case(); caselen is passed
 * through as its output parameter.
 */
bool
grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);

	return is_case(&reader, upper_major, upper_minor, upper_special,
	               caselen);
}
/*
 * Check the codepoint array src (srclen elements) against the
 * lower-tables. Returns the result of is_case(); caselen is passed
 * through as its output parameter.
 */
bool
grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);

	return is_case(&reader, lower_major, lower_minor, lower_special,
	               caselen);
}
/*
 * Check whether the codepoint array src (srclen elements) is in
 * titlecase. Returns the result of is_titlecase(); caselen is passed
 * through as its output parameter.
 */
bool
grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);

	return is_titlecase(&reader, caselen);
}
/*
 * Check the UTF-8 data src (srclen bytes) against the upper-tables.
 * Returns the result of is_case(); caselen is passed through as its
 * output parameter.
 */
bool
grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);

	return is_case(&reader, upper_major, upper_minor, upper_special,
	               caselen);
}
/*
 * Check the UTF-8 data src (srclen bytes) against the lower-tables.
 * Returns the result of is_case(); caselen is passed through as its
 * output parameter.
 */
bool
grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);

	return is_case(&reader, lower_major, lower_minor, lower_special,
	               caselen);
}
/*
 * Check whether the UTF-8 data src (srclen bytes) is in titlecase.
 * Returns the result of is_titlecase(); caselen is passed through as
 * its output parameter.
 */
bool
grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);

	return is_titlecase(&reader, caselen);
}

Binary file not shown.

View File

@@ -0,0 +1,243 @@
/* See LICENSE file for copyright and license details. */
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include "../gen/character.h"
#include "../grapheme.h"
#include "util.h"
/*
 * Unpacked form of the state that grapheme_is_character_break() keeps
 * between calls. It is stored in the caller's uint_least16_t via
 * state_serialize() and restored via state_deserialize().
 */
struct character_break_state {
uint_least8_t prop; /* break property of the previous right-hand codepoint */
bool prop_set; /* true once prop holds a valid value */
bool gb11_flag; /* row selector for the GB11 tables (second half if set) */
bool gb12_13_flag; /* row selector for the GB12/GB13 tables (second half if set) */
};
/*
 * Stateless "do not break" table: dont_break[left] is a bitmask with
 * bit number `right` set if there is no grapheme cluster break between
 * a codepoint with property `left` and a following codepoint with
 * property `right`. Rows that are not listed are zero.
 *
 * The two helper macros are local to this table.
 */
#define DONT_BREAK_BIT(prop) (UINT16_C(1) << (prop))
#define DONT_BREAK_GB9_GB9A                                 \
	(DONT_BREAK_BIT(CHAR_BREAK_PROP_EXTEND) | /* GB9 */ \
	 DONT_BREAK_BIT(CHAR_BREAK_PROP_ZWJ) |    /* GB9 */ \
	 DONT_BREAK_BIT(CHAR_BREAK_PROP_SPACINGMARK)) /* GB9a */
static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
	[CHAR_BREAK_PROP_OTHER] = DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_CR] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_LF), /* GB3 */
	[CHAR_BREAK_PROP_EXTEND] = DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_L] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_L) |   /* GB6 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_V) |   /* GB6 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_LV) |  /* GB6 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_LVT) | /* GB6 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_V] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_T] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_LV] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_HANGUL_LVT] =
		DONT_BREAK_BIT(CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */
		DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_PREPEND] =
		DONT_BREAK_GB9_GB9A |
		/* GB9b: everything except CR, LF and Control */
		(UINT16_C(0xFFFF) &
		 ~(DONT_BREAK_BIT(CHAR_BREAK_PROP_CR) |
		   DONT_BREAK_BIT(CHAR_BREAK_PROP_LF) |
		   DONT_BREAK_BIT(CHAR_BREAK_PROP_CONTROL))),
	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] = DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_SPACINGMARK] = DONT_BREAK_GB9_GB9A,
	[CHAR_BREAK_PROP_ZWJ] = DONT_BREAK_GB9_GB9A,
};
#undef DONT_BREAK_GB9_GB9A
#undef DONT_BREAK_BIT
/*
 * Transition table for the GB11-flag. It is indexed with the left
 * property, plus NUM_CHAR_BREAK_PROPS if the flag is currently set;
 * the flag is set afterwards if the bit of the right property is set
 * in the selected entry. Rows that are not listed are zero.
 */
static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
	/* flag is currently not set */
	[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTEND) |
		(UINT16_C(1) << CHAR_BREAK_PROP_ZWJ),

	/* flag is currently set */
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_ZWJ] =
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_EXTEND] =
		(UINT16_C(1) << CHAR_BREAK_PROP_ZWJ) |
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTEND),
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTEND) |
		(UINT16_C(1) << CHAR_BREAK_PROP_ZWJ),
};
/*
 * "Do not break" table for GB11. It is indexed with the left property,
 * plus NUM_CHAR_BREAK_PROPS if the GB11-flag is set. The only entry
 * forbids a break between ZWJ and Extended_Pictographic while the flag
 * is set; all other rows are zero.
 */
static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_ZWJ] =
		(UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
};
/*
 * Transition table for the GB12/GB13-flag. It is indexed with the left
 * property, plus NUM_CHAR_BREAK_PROPS if the flag is currently set.
 * The only entry sets the flag for a Regional_Indicator followed by
 * another Regional_Indicator while the flag is not set; all other rows
 * (including the whole "flag set" half) are zero.
 */
static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
		(UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
};
/*
 * "Do not break" table for GB12/GB13. It is indexed with the left
 * property, plus NUM_CHAR_BREAK_PROPS if the GB12/GB13-flag is set.
 * The only entry forbids a break between two Regional_Indicators while
 * the flag is set; all other rows are zero.
 */
static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
	[NUM_CHAR_BREAK_PROPS + CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
		(UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
};
/*
 * Look up the character break property of the codepoint cp in the
 * two-stage table char_break_major/char_break_minor. Codepoints above
 * U+10FFFF yield CHAR_BREAK_PROP_OTHER.
 */
static inline enum char_break_property
get_break_prop(uint_least32_t cp)
{
	if (likely(cp <= UINT32_C(0x10FFFF))) {
		/* upper bits select the block, the low 8 bits the entry in it */
		const uint_least32_t block = cp >> 8;
		const uint_least32_t entry = cp & 0xFF;

		return (enum char_break_property)
		       char_break_minor[char_break_major[block] + entry];
	}

	return CHAR_BREAK_PROP_OTHER;
}
/*
 * Pack the break state *in into the 16-bit value *out:
 *
 *    bits 0-7: prop
 *    bit 8:    prop_set
 *    bit 9:    gb11_flag
 *    bit 10:   gb12_13_flag
 */
static inline void
state_serialize(const struct character_break_state *in, uint_least16_t *out)
{
	const uint_least16_t prop_bits =
		(uint_least16_t)(in->prop & UINT8_C(0xFF));
	const uint_least16_t prop_set_bit =
		(uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8);
	const uint_least16_t gb11_bit =
		(uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9);
	const uint_least16_t gb12_13_bit =
		(uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10);

	*out = prop_bits | prop_set_bit | gb11_bit | gb12_13_bit;
}
/*
 * Unpack the 16-bit value in (as produced by state_serialize()) into
 * the break state *out: the low 8 bits hold prop, bits 8, 9 and 10
 * hold prop_set, gb11_flag and gb12_13_flag.
 */
static inline void
state_deserialize(uint_least16_t in, struct character_break_state *out)
{
	out->prop = in & UINT8_C(0xFF);
	out->prop_set = (in & (UINT16_C(1) << 8)) != 0;
	out->gb11_flag = (in & (UINT16_C(1) << 9)) != 0;
	out->gb12_13_flag = (in & (UINT16_C(1) << 10)) != 0;
}
bool
grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s)
{
struct character_break_state state;
enum char_break_property cp0_prop, cp1_prop;
bool notbreak = false;
if (likely(s)) {
state_deserialize(*s, &state);
if (likely(state.prop_set)) {
cp0_prop = state.prop;
} else {
cp0_prop = get_break_prop(cp0);
}
cp1_prop = get_break_prop(cp1);
/* preserve prop of right codepoint for next iteration */
state.prop = (uint_least8_t)cp1_prop;
state.prop_set = true;
/* update flags */
state.gb11_flag =
flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
state.gb11_flag] &
UINT16_C(1) << cp1_prop;
state.gb12_13_flag =
flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
state.gb12_13_flag] &
UINT16_C(1) << cp1_prop;
/*
* Apply grapheme cluster breaking algorithm (UAX #29), see
* http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
*/
notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
(dont_break_gb11[cp0_prop + state.gb11_flag *
NUM_CHAR_BREAK_PROPS] &
(UINT16_C(1) << cp1_prop)) ||
(dont_break_gb12_13[cp0_prop + state.gb12_13_flag *
NUM_CHAR_BREAK_PROPS] &
(UINT16_C(1) << cp1_prop));
/* update or reset flags (when we have a break) */
if (likely(!notbreak)) {
state.gb11_flag = state.gb12_13_flag = false;
}
state_serialize(&state, s);
} else {
cp0_prop = get_break_prop(cp0);
cp1_prop = get_break_prop(cp1);
/*
* Apply grapheme cluster breaking algorithm (UAX #29), see
* http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
*
* Given we have no state, this behaves as if the state-booleans
* were all set to false
*/
notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
(dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
(dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop));
}
return !notbreak;
}
/*
 * Advance the reader r up to the next grapheme cluster break and
 * return herodotus_reader_number_read(r).
 */
static size_t
next_character_break(HERODOTUS_READER *r)
{
	uint_least32_t left = 0, right = 0;
	uint_least16_t break_state = 0;

	/* consume the first codepoint, it becomes the left side */
	herodotus_read_codepoint(r, true, &left);

	/* peek at the right side and only consume it if we do not break */
	while (herodotus_read_codepoint(r, false, &right) ==
	       HERODOTUS_STATUS_SUCCESS) {
		if (grapheme_is_character_break(left, right, &break_state)) {
			break;
		}

		herodotus_read_codepoint(r, true, &left);
	}

	return herodotus_reader_number_read(r);
}
/*
 * Determine the next grapheme cluster break in the codepoint array
 * str (len elements). Returns the result of next_character_break().
 */
size_t
grapheme_next_character_break(const uint_least32_t *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, str, len);

	return next_character_break(&reader);
}
/*
 * Determine the next grapheme cluster break in the UTF-8 data str
 * (len bytes). Returns the result of next_character_break().
 */
size_t
grapheme_next_character_break_utf8(const char *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, str, len);

	return next_character_break(&reader);
}

Binary file not shown.

View File

@@ -0,0 +1,510 @@
/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include "../gen/line.h"
#include "../grapheme.h"
#include "util.h"
/*
 * Look up the line break property of the codepoint cp in the
 * two-stage table line_break_major/line_break_minor. Codepoints above
 * U+10FFFF yield LINE_BREAK_PROP_AL.
 */
static inline enum line_break_property
get_break_prop(uint_least32_t cp)
{
	if (likely(cp <= UINT32_C(0x10FFFF))) {
		/* upper bits select the block, the low 8 bits the entry in it */
		const uint_least32_t block = cp >> 8;
		const uint_least32_t entry = cp & 0xff;

		return (enum line_break_property)
		       line_break_minor[line_break_major[block] + entry];
	}

	return LINE_BREAK_PROP_AL;
}
/*
 * Advance the reader r up to the next line break opportunity and
 * return herodotus_reader_number_read(r).
 *
 * Each loop iteration examines one candidate position: cp0_prop is the
 * property of the codepoint to its left (already consumed) and cp1_prop
 * the property of the codepoint to its right (only peeked at). The
 * rules LB4 to LB31 are checked in order; the first matching rule
 * decides. A `continue` means "no break here" (the right codepoint is
 * consumed and becomes the new left one), a `break` means "break here".
 *
 * State kept across iterations:
 *
 *    last_non_cm_or_zwj_prop:    last left property that was not CM/ZWJ
 *    last_non_sp_prop:           last left property that was not SP
 *    last_non_sp_cm_or_zwj_prop: last value of last_non_cm_or_zwj_prop
 *                                that was not SP
 *    lb21a_flag:                 the previous last_non_cm_or_zwj_prop
 *                                was HL
 *    ri_even:                    parity of the run of RI to the left
 *    lb25_level:                 progress within the LB25 number
 *                                sequence (explained below)
 */
static size_t
next_line_break(HERODOTUS_READER *r)
{
HERODOTUS_READER tmp;
enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
uint_least32_t cp;
uint_least8_t lb25_level = 0;
bool lb21a_flag = false, ri_even = true;
/*
* Apply line breaking algorithm (UAX #14), see
* https://unicode.org/reports/tr14/#Algorithm and tailoring
* https://unicode.org/reports/tr14/#Examples (example 7),
* given the automatic test-cases implement this example for
* better number handling.
*
*/
/*
* Initialize the different properties such that we have
* a good state after the state-update in the loop
*/
last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
/*
* consume the first codepoint as the initial left side, then peek at
* the right side in every iteration and only consume it when going
* on to the next candidate position
*/
for (herodotus_read_codepoint(r, true, &cp), cp0_prop = get_break_prop(cp);
herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;
herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
/* get property of the right codepoint */
cp1_prop = get_break_prop(cp);
/* update retention-states */
/*
* store the last observed non-CM-or-ZWJ-property for
* LB9 and following.
*/
if (cp0_prop != LINE_BREAK_PROP_CM &&
cp0_prop != LINE_BREAK_PROP_ZWJ) {
/*
* check if the property we are overwriting now is an
* HL. If so, we set the LB21a-flag which depends on this
* knowledge.
*/
lb21a_flag = (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
/* check regional indicator state */
if (cp0_prop == LINE_BREAK_PROP_RI) {
/*
* The property we just shifted in is
* a regional indicator, increasing the
* number of consecutive RIs on the left
* side of the breakpoint by one, changing
* the oddness.
*
*/
ri_even = !ri_even;
} else {
/*
* We saw no regional indicator, so the
* number of consecutive RIs on the left
* side of the breakpoint is zero, which
* is an even number.
*
*/
ri_even = true;
}
/*
* Here comes a bit of magic. The tailored rule
* LB25 (using example 7) has a very complicated
* left-hand-side-rule of the form
*
* NU (NU | SY | IS)* (CL | CP)?
*
* but instead of backtracking, we keep the state
* as some kind of "power level" in the variable
*
* lb25_level
*
* that goes from 0 to 3
*
* 0: we are not in the sequence
* 1: we have one NU to the left of the middle
* spot
* 2: we have one NU and one or more (NU | SY | IS)
* to the left of the middle spot
* 3: we have one NU, zero or more (NU | SY | IS)
* and one (CL | CP) to the left of the middle
* spot
*/
if ((lb25_level == 0 ||
lb25_level == 1) &&
cp0_prop == LINE_BREAK_PROP_NU) {
/* sequence has begun */
lb25_level = 1;
} else if ((lb25_level == 1 || lb25_level == 2) &&
(cp0_prop == LINE_BREAK_PROP_NU ||
cp0_prop == LINE_BREAK_PROP_SY ||
cp0_prop == LINE_BREAK_PROP_IS)) {
/* (NU | SY | IS) sequence begins or continued */
lb25_level = 2;
} else if ((lb25_level == 1 || lb25_level == 2) &&
(cp0_prop == LINE_BREAK_PROP_CL ||
cp0_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
/* CL or CP at the end of the sequence */
lb25_level = 3;
} else {
/* sequence broke */
lb25_level = 0;
}
last_non_cm_or_zwj_prop = cp0_prop;
}
/*
* store the last observed non-SP-property for LB8, LB14,
* LB15, LB16 and LB17. LB8 gets its own unskipped property,
* whereas the others build on top of the CM-ZWJ-skipped
* properties as they come after LB9
*/
if (cp0_prop != LINE_BREAK_PROP_SP) {
last_non_sp_prop = cp0_prop;
}
if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
}
/* apply the algorithm */
/* LB4 */
if (cp0_prop == LINE_BREAK_PROP_BK) {
break;
}
/* LB5 */
if (cp0_prop == LINE_BREAK_PROP_CR &&
cp1_prop == LINE_BREAK_PROP_LF) {
continue;
}
if (cp0_prop == LINE_BREAK_PROP_CR ||
cp0_prop == LINE_BREAK_PROP_LF ||
cp0_prop == LINE_BREAK_PROP_NL) {
break;
}
/* LB6 */
if (cp1_prop == LINE_BREAK_PROP_BK ||
cp1_prop == LINE_BREAK_PROP_CR ||
cp1_prop == LINE_BREAK_PROP_LF ||
cp1_prop == LINE_BREAK_PROP_NL) {
continue;
}
/* LB7 */
if (cp1_prop == LINE_BREAK_PROP_SP ||
cp1_prop == LINE_BREAK_PROP_ZW) {
continue;
}
/* LB8 */
if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
break;
}
/* LB8a */
if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
continue;
}
/* LB9 */
if ((cp0_prop != LINE_BREAK_PROP_BK &&
cp0_prop != LINE_BREAK_PROP_CR &&
cp0_prop != LINE_BREAK_PROP_LF &&
cp0_prop != LINE_BREAK_PROP_NL &&
cp0_prop != LINE_BREAK_PROP_SP &&
cp0_prop != LINE_BREAK_PROP_ZW) &&
(cp1_prop == LINE_BREAK_PROP_CM ||
cp1_prop == LINE_BREAK_PROP_ZWJ)) {
/*
* given we skip them, we don't break in such
* a sequence
*/
continue;
}
/* LB10 is baked into the following rules */
/* LB11 */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
cp1_prop == LINE_BREAK_PROP_WJ) {
continue;
}
/* LB12 */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
continue;
}
/* LB12a */
if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
cp1_prop == LINE_BREAK_PROP_GL) {
continue;
}
/* LB13 (affected by tailoring for LB25, see example 7) */
if (cp1_prop == LINE_BREAK_PROP_EX ||
(last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
(cp1_prop == LINE_BREAK_PROP_CL ||
cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_IS ||
cp1_prop == LINE_BREAK_PROP_SY))) {
continue;
}
/* LB14 */
if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
continue;
}
/* LB15 */
if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
(cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
continue;
}
/* LB16 */
if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
cp1_prop == LINE_BREAK_PROP_NS) {
continue;
}
/* LB17 */
if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
cp1_prop == LINE_BREAK_PROP_B2) {
continue;
}
/* LB18 */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
break;
}
/* LB19 */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
cp1_prop == LINE_BREAK_PROP_QU) {
continue;
}
/* LB20 */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
cp1_prop == LINE_BREAK_PROP_CB) {
break;
}
/* LB21 */
if (cp1_prop == LINE_BREAK_PROP_BA ||
cp1_prop == LINE_BREAK_PROP_HY ||
cp1_prop == LINE_BREAK_PROP_NS ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
continue;
}
/* LB21a */
if (lb21a_flag &&
(last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
continue;
}
/* LB21b */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
cp1_prop == LINE_BREAK_PROP_HL) {
continue;
}
/* LB22 */
if (cp1_prop == LINE_BREAK_PROP_IN) {
continue;
}
/* LB23 */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
cp1_prop == LINE_BREAK_PROP_NU) {
continue;
}
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL)) {
continue;
}
/* LB23a */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
(cp1_prop == LINE_BREAK_PROP_ID ||
cp1_prop == LINE_BREAK_PROP_EB ||
cp1_prop == LINE_BREAK_PROP_EM)) {
continue;
}
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
cp1_prop == LINE_BREAK_PROP_PO) {
continue;
}
/* LB24 */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL)) {
continue;
}
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
(cp1_prop == LINE_BREAK_PROP_PR ||
cp1_prop == LINE_BREAK_PROP_PO)) {
continue;
}
/* LB25 (tailored with example 7) */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
if (cp1_prop == LINE_BREAK_PROP_NU) {
continue;
}
/* this stupid rule is the reason why we cannot
* simply have a stateful break-detection between
* two adjacent codepoints as we have it with
* characters.
*/
/*
* look one codepoint beyond the right side using a copy of
* the reader: the first read skips the right codepoint, the
* second one yields the codepoint after it. cp is clobbered
* here, but it is read anew before its next use.
*/
herodotus_reader_copy(r, &tmp);
herodotus_read_codepoint(&tmp, true, &cp);
if (herodotus_read_codepoint(&tmp, true, &cp) ==
HERODOTUS_STATUS_SUCCESS &&
(cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_HY)) {
if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
continue;
}
}
}
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
cp1_prop == LINE_BREAK_PROP_NU) {
continue;
}
/*
* NOTE(review): this check looks like a subset of the one directly
* below it (level 1 with NU, SY or IS); it is kept as is.
*/
if (lb25_level == 1 &&
(cp1_prop == LINE_BREAK_PROP_NU ||
cp1_prop == LINE_BREAK_PROP_SY ||
cp1_prop == LINE_BREAK_PROP_IS)) {
continue;
}
if ((lb25_level == 1 || lb25_level == 2) &&
(cp1_prop == LINE_BREAK_PROP_NU ||
cp1_prop == LINE_BREAK_PROP_SY ||
cp1_prop == LINE_BREAK_PROP_IS ||
cp1_prop == LINE_BREAK_PROP_CL ||
cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
continue;
}
if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
(cp1_prop == LINE_BREAK_PROP_PO ||
cp1_prop == LINE_BREAK_PROP_PR)) {
continue;
}
/* LB26 */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
(cp1_prop == LINE_BREAK_PROP_JL ||
cp1_prop == LINE_BREAK_PROP_JV ||
cp1_prop == LINE_BREAK_PROP_H2 ||
cp1_prop == LINE_BREAK_PROP_H3)) {
continue;
}
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
(cp1_prop == LINE_BREAK_PROP_JV ||
cp1_prop == LINE_BREAK_PROP_JT)) {
continue;
}
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
cp1_prop == LINE_BREAK_PROP_JT) {
continue;
}
/* LB27 */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
cp1_prop == LINE_BREAK_PROP_PO) {
continue;
}
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
(cp1_prop == LINE_BREAK_PROP_JL ||
cp1_prop == LINE_BREAK_PROP_JV ||
cp1_prop == LINE_BREAK_PROP_JT ||
cp1_prop == LINE_BREAK_PROP_H2 ||
cp1_prop == LINE_BREAK_PROP_H3)) {
continue;
}
/* LB28 */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL)) {
continue;
}
/* LB29 */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL)) {
continue;
}
/* LB30 */
if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
continue;
}
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
(cp1_prop == LINE_BREAK_PROP_AL ||
cp1_prop == LINE_BREAK_PROP_HL ||
cp1_prop == LINE_BREAK_PROP_NU)) {
continue;
}
/* LB30a */
if (!ri_even &&
last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
cp1_prop == LINE_BREAK_PROP_RI) {
continue;
}
/* LB30b */
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
cp1_prop == LINE_BREAK_PROP_EM) {
continue;
}
if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
cp1_prop == LINE_BREAK_PROP_EM) {
continue;
}
/* LB31 */
break;
}
return herodotus_reader_number_read(r);
}
/*
 * Determine the next line break opportunity in the codepoint array
 * str (len elements). Returns the result of next_line_break().
 */
size_t
grapheme_next_line_break(const uint_least32_t *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, str, len);

	return next_line_break(&reader);
}
/*
 * Determine the next line break opportunity in the UTF-8 data str
 * (len bytes). Returns the result of next_line_break().
 */
size_t
grapheme_next_line_break_utf8(const char *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, str, len);

	return next_line_break(&reader);
}

Binary file not shown.

View File

@@ -0,0 +1,282 @@
/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include "../gen/sentence.h"
#include "../grapheme.h"
#include "util.h"
/*
 * Left-context state of the sentence-break scan. Both members are
 * "levels" maintained by sentence_skip_shift_callback(); 0 means the
 * respective sequence is not currently present left of the break
 * candidate.
 */
struct sentence_break_state
{
/* progress within "ATerm Close* Sp*" (0 to 3), consulted by SB8 */
uint_least8_t aterm_close_sp_level;
/* progress within "SATerm Close* Sp* ParaSep?" (0 to 4), consulted by SB8a-SB11 */
uint_least8_t saterm_close_sp_parasep_level;
};
/*
 * Look up the sentence-break property of cp in the two-stage table
 * (major is indexed by the high bits, minor by major-offset plus the
 * low byte). Anything above U+10FFFF maps to SENTENCE_BREAK_PROP_OTHER.
 */
static inline uint_least8_t
get_sentence_break_prop(uint_least32_t cp)
{
	if (!likely(cp <= UINT32_C(0x10FFFF))) {
		/* outside the Unicode codespace, so no table entry */
		return SENTENCE_BREAK_PROP_OTHER;
	}

	return (uint_least8_t)
	       sentence_break_minor[sentence_break_major[cp >> 8] +
	                            (cp & 0xff)];
}
/*
 * Tell whether prop is one of the properties (Extend, Format) that the
 * "skip" view of the sentence-break scan passes over.
 */
static bool
is_skippable_sentence_prop(uint_least8_t prop)
{
	if (prop == SENTENCE_BREAK_PROP_EXTEND) {
		return true;
	}

	return prop == SENTENCE_BREAK_PROP_FORMAT;
}
/*
 * Skip-shift callback of the sentence-break scan. It is handed the
 * non-skippable property that has just moved to the left of the break
 * candidate and updates the two left-context levels in the
 * struct sentence_break_state that s points to.
 *
 * The rules SB8, SB8a, SB9, SB10 and SB11 have left-hand sides of the
 * form
 *
 *         ATerm Close* Sp*
 *         SATerm Close*
 *         SATerm Close* Sp*
 *         SATerm Close* Sp* ParaSep?
 *
 * Instead of backtracking, each sequence is tracked as a level:
 *
 *         0: we are not in the sequence
 *         1: an ATerm/SATerm is directly left of the break candidate
 *         2: an ATerm/SATerm followed by one or more Close
 *         3: an ATerm/SATerm, zero or more Close, one or more Sp
 *         4: (SATerm only) additionally followed by one ParaSep
 *
 * A terminator always (re)starts its sequence at level 1, whatever the
 * previous level was: after e.g. "ATerm Close ATerm" the left context
 * ends in a fresh ATerm. Dropping to 0 there would make the scan miss
 * the break that SB9-SB11 place after the second terminator.
 */
static void
sentence_skip_shift_callback(uint_least8_t prop, void *s)
{
	struct sentence_break_state *state = (struct sentence_break_state *)s;
	uint_least8_t level;

	/* ATerm Close* Sp* */
	level = state->aterm_close_sp_level;
	if (prop == SENTENCE_BREAK_PROP_ATERM) {
		/* sequence begins (or begins anew) */
		level = 1;
	} else if (prop == SENTENCE_BREAK_PROP_CLOSE &&
	           (level == 1 || level == 2)) {
		/* close-sequence begins or continues */
		level = 2;
	} else if (prop == SENTENCE_BREAK_PROP_SP &&
	           (level == 1 || level == 2 || level == 3)) {
		/* sp-sequence begins or continues */
		level = 3;
	} else {
		/* sequence broke */
		level = 0;
	}
	state->aterm_close_sp_level = level;

	/* SATerm Close* Sp* ParaSep? */
	level = state->saterm_close_sp_parasep_level;
	if (prop == SENTENCE_BREAK_PROP_STERM ||
	    prop == SENTENCE_BREAK_PROP_ATERM) {
		/* sequence begins (or begins anew) */
		level = 1;
	} else if (prop == SENTENCE_BREAK_PROP_CLOSE &&
	           (level == 1 || level == 2)) {
		/* close-sequence begins or continues */
		level = 2;
	} else if (prop == SENTENCE_BREAK_PROP_SP &&
	           (level == 1 || level == 2 || level == 3)) {
		/* sp-sequence begins or continues */
		level = 3;
	} else if ((prop == SENTENCE_BREAK_PROP_SEP ||
	            prop == SENTENCE_BREAK_PROP_CR ||
	            prop == SENTENCE_BREAK_PROP_LF) &&
	           (level == 1 || level == 2 || level == 3)) {
		/* ParaSep at the end of the sequence */
		level = 4;
	} else {
		/* sequence broke */
		level = 0;
	}
	state->saterm_close_sp_parasep_level = level;
}
/*
 * Scan forward from the current position of r and return how far the
 * first sentence boundary lies ahead, as counted by the mid-reader
 * (codepoints or bytes, depending on the reader type). The rules are
 * tested in the order given by UAX #29; "continue" means "no break
 * here, look at the next position" and "break" ends the scan at the
 * current position. The scan also ends when proper_advance() reports
 * the end of the input.
 */
static size_t
next_sentence_break(HERODOTUS_READER *r)
{
HERODOTUS_READER tmp;
enum sentence_break_property prop;
struct proper p;
struct sentence_break_state state = { 0 };
uint_least32_t cp;
/*
* Apply sentence breaking algorithm (UAX #29), see
* https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
*/
proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
get_sentence_break_prop, is_skippable_sentence_prop,
sentence_skip_shift_callback, &p);
while (!proper_advance(&p)) {
/* SB3 */
if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
continue;
}
/* SB4 */
if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
break;
}
/* SB5 */
if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
continue;
}
/* SB6 */
if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
continue;
}
/* SB7 */
if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
continue;
}
/* SB8 */
if (state.aterm_close_sp_level == 1 ||
state.aterm_close_sp_level == 2 ||
state.aterm_close_sp_level == 3) {
/*
* This is the most complicated rule, requiring
* the right-hand-side to satisfy the regular expression
*
* ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
*
* which we simply check "manually" given LUT-lookups
* are very cheap by starting at the mid_reader.
*
*/
herodotus_reader_copy(&(p.mid_reader), &tmp);
/* sentinel: stays in place if not a single codepoint can be read */
prop = NUM_SENTENCE_BREAK_PROPS;
while (herodotus_read_codepoint(&tmp, true, &cp) ==
HERODOTUS_STATUS_SUCCESS) {
prop = get_sentence_break_prop(cp);
/*
* the skippable properties are ignored
* automatically here given they do not
* match the following condition
*/
if (prop == SENTENCE_BREAK_PROP_OLETTER ||
prop == SENTENCE_BREAK_PROP_UPPER ||
prop == SENTENCE_BREAK_PROP_LOWER ||
prop == SENTENCE_BREAK_PROP_SEP ||
prop == SENTENCE_BREAK_PROP_CR ||
prop == SENTENCE_BREAK_PROP_LF ||
prop == SENTENCE_BREAK_PROP_STERM ||
prop == SENTENCE_BREAK_PROP_ATERM) {
break;
}
}
/* the look-ahead stopped at a Lower: SB8 matches, no break */
if (prop == SENTENCE_BREAK_PROP_LOWER) {
continue;
}
}
/* SB8a */
if ((state.saterm_close_sp_parasep_level == 1 ||
state.saterm_close_sp_parasep_level == 2 ||
state.saterm_close_sp_parasep_level == 3) &&
(p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
continue;
}
/* SB9 */
if ((state.saterm_close_sp_parasep_level == 1 ||
state.saterm_close_sp_parasep_level == 2) &&
(p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/* SB10 */
if ((state.saterm_close_sp_parasep_level == 1 ||
state.saterm_close_sp_parasep_level == 2 ||
state.saterm_close_sp_parasep_level == 3) &&
(p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
continue;
}
/* SB11 */
if (state.saterm_close_sp_parasep_level == 1 ||
state.saterm_close_sp_parasep_level == 2 ||
state.saterm_close_sp_parasep_level == 3 ||
state.saterm_close_sp_parasep_level == 4) {
break;
}
/* SB998 */
continue;
}
/* the mid-reader sits exactly on the boundary that ended the scan */
return herodotus_reader_number_read(&(p.mid_reader));
}
/*
 * Find the next sentence boundary in the codepoint array str of length
 * len and return its offset in codepoints. A len of SIZE_MAX makes the
 * reader stop at the first NUL codepoint instead.
 */
size_t
grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, str, len);

	return next_sentence_break(&reader);
}
/*
 * Find the next sentence boundary in the UTF-8 buffer str of length
 * len and return its offset in bytes. A len of SIZE_MAX makes the
 * reader stop at the first NUL codepoint instead.
 */
size_t
grapheme_next_sentence_break_utf8(const char *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, str, len);

	return next_sentence_break(&reader);
}

Binary file not shown.

View File

@@ -0,0 +1,219 @@
/* See LICENSE file for copyright and license details. */
#include <stddef.h>
#include <stdint.h>
#include "../grapheme.h"
#include "util.h"
/*
 * Inclusive range check l <= c <= u. The argument c may be evaluated
 * twice, so only pass expressions without side effects.
 */
#define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u))
/*
 * Lookup-table describing the four UTF-8 sequence types by their first
 * byte. The table index equals the number of continuation bytes
 * (10xxxxxx, carrying 6 bits each) that follow the first byte.
 */
static const struct {
	uint_least8_t lower;   /* lowest first byte; also its fixed prefix bits */
	uint_least8_t upper;   /* highest first byte */
	uint_least32_t mincp;  /* smallest codepoint that is not overlong here */
	uint_least32_t maxcp;  /* largest codepoint the sequence type can hold */
} lut[] = {
	/* 0xxxxxxx: 7 bits capacity */
	{
		.lower = 0x00,
		.upper = 0x7F,
		.mincp = UINT32_C(0x000000),
		.maxcp = UINT32_C(0x00007F),
	},
	/* 110xxxxx + 1 continuation byte: 5+6=11 bits capacity */
	{
		.lower = 0xC0,
		.upper = 0xDF,
		.mincp = UINT32_C(0x000080),
		.maxcp = UINT32_C(0x0007FF),
	},
	/* 1110xxxx + 2 continuation bytes: 4+6+6=16 bits capacity */
	{
		.lower = 0xE0,
		.upper = 0xEF,
		.mincp = UINT32_C(0x000800),
		.maxcp = UINT32_C(0x00FFFF),
	},
	/* 11110xxx + 3 continuation bytes: 3+6+6+6=21 bits capacity */
	{
		.lower = 0xF0,
		.upper = 0xF7,
		.mincp = UINT32_C(0x010000),
		.maxcp = UINT32_C(0x1FFFFF),
	},
};
/*
 * Decode one codepoint from the first bytes of str (at most len bytes)
 * into *cp; cp may be NULL if only the length is of interest.
 *
 * Returns the number of bytes the sequence occupies:
 *  - 0 if str is NULL or len is 0,
 *  - 1 if the first byte cannot start a sequence,
 *  - the number of bytes before the first non-continuation byte if the
 *    sequence is cut short by such a byte,
 *  - the full expected sequence length otherwise, which exceeds len if
 *    the buffer ended in the middle of the sequence.
 * In every failure case, and for overlong encodings, surrogates and
 * values above U+10FFFF, *cp is GRAPHEME_INVALID_CODEPOINT.
 */
size_t
grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp)
{
	const unsigned char *bytes = (const unsigned char *)str;
	uint_least32_t scratch;
	size_t ncont, k;

	if (cp == NULL) {
		/* let the decoder write unconditionally */
		cp = &scratch;
	}

	if (str == NULL || len == 0) {
		/* a sequence is at least 1 byte long */
		*cp = GRAPHEME_INVALID_CODEPOINT;
		return 0;
	}

	/*
	 * classify the first byte; the matching table index is the
	 * number of continuation bytes to expect
	 */
	ncont = 0;
	while (ncont < LEN(lut) &&
	       !BETWEEN(bytes[0], lut[ncont].lower, lut[ncont].upper)) {
		ncont++;
	}
	if (ncont == LEN(lut)) {
		/*
		 * not a valid first byte (this includes bytes with bits
		 * above the 8th set on systems with CHAR_BIT > 8);
		 * consume just this byte
		 */
		*cp = GRAPHEME_INVALID_CODEPOINT;
		return 1;
	}

	/* strip the prefix bits, leaving the payload of the first byte */
	*cp = bytes[0] - lut[ncont].lower;

	if (1 + ncont > len) {
		/* the buffer ends before the sequence does */
		*cp = GRAPHEME_INVALID_CODEPOINT;

		/*
		 * count only the continuation bytes that are really
		 * there, so that e.g. a sequence starter right before
		 * a NUL-byte does not swallow it
		 */
		k = 0;
		while (1 + k < len && BETWEEN(bytes[1 + k], 0x80, 0xBF)) {
			k++;
		}

		/*
		 * if something other than a continuation byte came
		 * before the end of the buffer, report the truncated
		 * length; otherwise report the length we expected,
		 * which is larger than len
		 */
		return (1 + k < len) ? (1 + k) : (1 + ncont);
	}

	/* fold in the continuation bytes, 6 payload bits at a time */
	for (k = 1; k <= ncont; k++) {
		if (!BETWEEN(bytes[k], 0x80, 0xBF)) {
			/*
			 * not of the form 10xxxxxx; report the bytes
			 * processed so far, excluding the offending
			 * one, as recommended since Unicode 6
			 * (chapter 3)
			 */
			*cp = GRAPHEME_INVALID_CODEPOINT;
			return k;
		}

		*cp = (*cp << 6) | (bytes[k] & 0x3F);
	}

	if (*cp < lut[ncont].mincp ||
	    BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
	    *cp > UINT32_C(0x10FFFF)) {
		/*
		 * overlong encoding, UTF-16 surrogate half or beyond
		 * the Unicode codespace (RFC 3629 forbids the latter
		 * two); the sequence length is still reported in full
		 */
		*cp = GRAPHEME_INVALID_CODEPOINT;
	}

	return 1 + ncont;
}
size_t
grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len)
{
size_t off, i;
if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
cp > UINT32_C(0x10FFFF)) {
/*
* codepoint is a high or low UTF-16 surrogate half
* (0xD800..0xDFFF) or not representable in UTF-16
* (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
*/
cp = GRAPHEME_INVALID_CODEPOINT;
}
/* determine necessary sequence type */
for (off = 0; off < LEN(lut); off++) {
if (cp <= lut[off].maxcp) {
break;
}
}
if (1 + off > len || str == NULL || len == 0) {
/*
* specified buffer is too small to store sequence or
* the caller just wanted to know how many bytes the
* codepoint needs by passing a NULL-buffer.
*/
return 1 + off;
}
/* build sequence by filling cp-bits into each byte */
/*
* lut[off].lower is the bit-format for the first byte and
* the bits to fill into it are determined by shifting the
* cp 6 times the number of following bytes, as each
* following byte stores 6 bits, yielding the wanted bits.
*
* We do not overwrite the mask because we guaranteed earlier
* that there are no bits higher than the mask allows.
*/
((unsigned char *)str)[0] = lut[off].lower |
(uint_least8_t)(cp >> (6 * off));
for (i = 1; i <= off; i++) {
/*
* the bit-format for following bytes is 10000000 (0x80)
* and it each stores 6 bits in the 6 low bits that we
* extract from the properly-shifted value using the
* mask 00111111 (0x3F)
*/
((unsigned char *)str)[i] = 0x80 |
((cp >> (6 * (off - i))) & 0x3F);
}
return 1 + off;
}

Binary file not shown.

View File

@@ -0,0 +1,417 @@
/* See LICENSE file for copyright and license details. */
#include <limits.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "../gen/types.h"
#include "../grapheme.h"
#include "util.h"
/*
 * Set up r to read srclen units of the given type from src, starting
 * at offset 0 and with no soft limit in effect (all slots SIZE_MAX).
 */
void
herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
                      const void *src, size_t srclen)
{
	size_t slot;

	for (slot = 0; slot < LEN(r->soft_limit); slot++) {
		r->soft_limit[slot] = SIZE_MAX;
	}

	r->terminated_by_null = false;
	r->off = 0;
	r->srclen = srclen;
	r->src = src;
	r->type = type;
}
/*
 * Make dest a fresh reader over the part of src that has not been read
 * yet: its source pointer is moved forward by src->off, its offset is
 * 0 and the length and every soft limit are rebased accordingly.
 * SIZE_MAX values are carried over unchanged.
 */
void
herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
{
	size_t slot;

	dest->type = src->type;

	/* advance the source pointer in units of the reader type */
	if (src->src == NULL) {
		dest->src = NULL;
	} else if (src->type == HERODOTUS_TYPE_CODEPOINT) {
		dest->src = ((const uint_least32_t *)(src->src)) + src->off;
	} else { /* src->type == HERODOTUS_TYPE_UTF8 */
		dest->src = ((const char *)(src->src)) + src->off;
	}

	if (src->srclen == SIZE_MAX) {
		dest->srclen = SIZE_MAX;
	} else if (src->off < src->srclen) {
		dest->srclen = src->srclen - src->off;
	} else {
		dest->srclen = 0;
	}

	dest->off = 0;
	dest->terminated_by_null = src->terminated_by_null;

	for (slot = 0; slot < LEN(src->soft_limit); slot++) {
		if (src->soft_limit[slot] == SIZE_MAX) {
			dest->soft_limit[slot] = SIZE_MAX;
		} else if (src->off < src->soft_limit[slot]) {
			dest->soft_limit[slot] = src->soft_limit[slot] -
			                         src->off;
		} else {
			/*
			 * degenerate case: the offset has already
			 * reached or passed this limit. The limit can't
			 * be lifted from here, so clamp it to zero and
			 * thereby prevent any further reads.
			 */
			dest->soft_limit[slot] = 0;
		}
	}
}
/*
 * Push a new soft limit that lies count units past the current offset.
 * The existing limits move down by one slot; the one in the last slot
 * is dropped.
 */
void
herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
{
	size_t slot;

	/* walk from the last slot towards the first to not clobber values */
	for (slot = LEN(r->soft_limit); --slot > 0; ) {
		r->soft_limit[slot] = r->soft_limit[slot - 1];
	}

	r->soft_limit[0] = r->off + count;
}
/*
 * Drop the innermost soft limit. The remaining limits move up by one
 * slot and the freed last slot becomes "no limit" (SIZE_MAX).
 */
void
herodotus_reader_pop_limit(HERODOTUS_READER *r)
{
	size_t slot;

	for (slot = 1; slot < LEN(r->soft_limit); slot++) {
		r->soft_limit[slot - 1] = r->soft_limit[slot];
	}

	r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
}
/*
 * Return the distance from the current offset of r to the next word
 * break, looking no further than the source length and the innermost
 * soft limit allow.
 *
 * The remaining length is derived the same way herodotus_reader_copy()
 * does it:
 *  - if neither a length nor a soft limit is set (both SIZE_MAX), the
 *    SIZE_MAX sentinel is passed on unchanged rather than as
 *    SIZE_MAX - off, so the input is still read as NUL-terminated
 *    (NOTE(review): this assumes grapheme_next_word_break*() hands the
 *    length to a Herodotus reader like the other break functions do);
 *  - if the offset has already reached or passed the limit, the
 *    remaining length is 0 instead of wrapping around.
 */
size_t
herodotus_reader_next_word_break(const HERODOTUS_READER *r)
{
	const size_t limit = MIN(r->srclen, r->soft_limit[0]);
	size_t remaining;

	if (limit == SIZE_MAX) {
		remaining = SIZE_MAX;
	} else if (r->off < limit) {
		remaining = limit - r->off;
	} else {
		remaining = 0;
	}

	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
		return grapheme_next_word_break(
			(const uint_least32_t *)(r->src) + r->off, remaining);
	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
		return grapheme_next_word_break_utf8(
			(const char *)(r->src) + r->off, remaining);
	}
}
/*
 * Return how many units the codepoint at the current offset of r
 * occupies: for codepoint readers 1 (or 0 at the limit), for UTF-8
 * readers whatever grapheme_decode_utf8() reports for the bytes up to
 * the limit.
 */
size_t
herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
{
	const size_t limit = MIN(r->srclen, r->soft_limit[0]);

	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
		return (r->off < limit) ? 1 : 0;
	}

	/* r->type == HERODOTUS_TYPE_UTF8 */
	return grapheme_decode_utf8((const char *)(r->src) + r->off,
	                            limit - r->off, NULL);
}
/*
 * Return the current offset of r, i.e. the number of units (codepoints
 * or bytes, depending on the reader type) consumed so far.
 */
size_t
herodotus_reader_number_read(const HERODOTUS_READER *r)
{
	const size_t consumed = r->off;

	return consumed;
}
/*
 * Read the codepoint at the current offset of r into *cp and, if
 * advance is set, move the offset past it.
 *
 * Returns HERODOTUS_STATUS_SUCCESS if a codepoint was read,
 * HERODOTUS_STATUS_SOFT_LIMIT_REACHED if the innermost soft limit has
 * been reached and HERODOTUS_STATUS_END_OF_BUFFER if there is no
 * source, the source is exhausted, a NUL terminated a source of length
 * SIZE_MAX, or the codepoint would extend past the end or the limit.
 * The offset only ever changes on success.
 */
enum herodotus_status
herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
{
	size_t limit, consumed;

	if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
		*cp = GRAPHEME_INVALID_CODEPOINT;
		return HERODOTUS_STATUS_END_OF_BUFFER;
	}

	if (r->off >= r->soft_limit[0]) {
		*cp = GRAPHEME_INVALID_CODEPOINT;
		return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
	}

	limit = MIN(r->srclen, r->soft_limit[0]);

	if (r->type != HERODOTUS_TYPE_CODEPOINT) {
		/* r->type == HERODOTUS_TYPE_UTF8 */
		consumed = grapheme_decode_utf8((const char *)r->src + r->off,
		                                limit - r->off, cp);
	} else {
		*cp = ((const uint_least32_t *)(r->src))[r->off];
		consumed = 1;
	}

	if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
		/*
		 * A NUL-codepoint in a source of unspecified length is
		 * the terminator. Leave the offset alone and remember
		 * that the buffer ends here.
		 */
		r->terminated_by_null = true;
		return HERODOTUS_STATUS_END_OF_BUFFER;
	}

	if (r->off + consumed > limit) {
		/*
		 * the sequence claims more than is available; end here
		 * instead of handing out garbage
		 */
		return HERODOTUS_STATUS_END_OF_BUFFER;
	}

	/* the new offset is now known to stay within the limits */
	if (advance) {
		r->off += consumed;
	}

	return HERODOTUS_STATUS_SUCCESS;
}
/*
 * Set up w to write units of the given type into dest, which has room
 * for destlen of them. Nothing has been written and no write has been
 * refused yet (first_unwritable_offset is SIZE_MAX).
 */
void
herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type,
                      void *dest, size_t destlen)
{
	w->first_unwritable_offset = SIZE_MAX;
	w->off = 0;
	w->destlen = destlen;
	w->dest = dest;
	w->type = type;
}
/*
 * Terminate the destination buffer of w with a NUL unit. Neither the
 * offset of w nor a NULL or zero-length destination is touched.
 */
void
herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
{
	size_t at;

	if (w->dest == NULL) {
		return;
	}

	if (w->off < w->destlen) {
		/* there is still room right behind the written data */
		at = w->off;
	} else if (w->first_unwritable_offset < w->destlen) {
		/*
		 * The buffer is exhausted, but the first write that was
		 * refused started inside of it (a multibyte-codepoint
		 * did not fit anymore). The data ends there, which may
		 * be up to a few bytes before the end of the buffer.
		 */
		at = w->first_unwritable_offset;
	} else if (w->destlen > 0) {
		/*
		 * The buffer is exhausted and no refused write started
		 * inside of it, so the last unit has to be sacrificed.
		 */
		at = w->destlen - 1;
	} else {
		/* no room at all */
		return;
	}

	if (w->type == HERODOTUS_TYPE_CODEPOINT) {
		((uint_least32_t *)(w->dest))[at] = 0;
	} else { /* w->type == HERODOTUS_TYPE_UTF8 */
		((char *)(w->dest))[at] = '\0';
	}
}
/*
 * Return the offset of w, i.e. the number of units accounted for so
 * far, including those that did not fit into the destination.
 */
size_t
herodotus_writer_number_written(const HERODOTUS_WRITER *w)
{
	const size_t accounted = w->off;

	return accounted;
}
/*
 * Append cp to w. The offset always grows by the size of cp, whether
 * it fit into the destination or not, so that the caller can detect
 * truncation by comparing the final offset with the buffer length.
 */
void
herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
{
	size_t need;

	if (w->type == HERODOTUS_TYPE_CODEPOINT) {
		if (w->dest != NULL && w->off < w->destlen) {
			((uint_least32_t *)(w->dest))[w->off] = cp;
		}
		w->off += 1;
		return;
	}

	/* w->type == HERODOTUS_TYPE_UTF8: ask for the encoded size first */
	need = grapheme_encode_utf8(cp, NULL, 0);

	if (w->dest != NULL && w->off + need < w->destlen) {
		/* the sequence fits with room to spare */
		grapheme_encode_utf8(cp, (char *)(w->dest) + w->off,
		                     w->destlen - w->off);
	} else if (w->first_unwritable_offset == SIZE_MAX) {
		/*
		 * this is the first sequence that could not be written
		 * (completely); remember where it would have started
		 */
		w->first_unwritable_offset = w->off;
	}

	w->off += need;
}
/*
 * Prepare p for a scan that starts at the current position of r.
 *
 * The callbacks, the opaque state pointer and the "no property" marker
 * are stored as given. All three readers start out as copies of r. The
 * previous properties are set to no_prop, as nothing lies to the left
 * of the start, and the two next properties of the raw and the skip
 * view are read ahead, remaining no_prop where the input ends early.
 */
void
proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
            uint_least8_t (*get_break_prop)(uint_least32_t),
            bool (*is_skippable_prop)(uint_least8_t),
            void (*skip_shift_callback)(uint_least8_t, void *),
            struct proper *p)
{
	uint_least8_t prop;
	uint_least32_t cp;
	size_t filled;

	p->state = state;
	p->no_prop = no_prop;
	p->get_break_prop = get_break_prop;
	p->is_skippable_prop = is_skippable_prop;
	p->skip_shift_callback = skip_shift_callback;

	/* the mid-reader only marks the position of the viewing-line */
	herodotus_reader_copy(r, &(p->mid_reader));

	/* nothing has passed the viewing-line yet */
	p->raw.prev_prop[0] = p->no_prop;
	p->raw.prev_prop[1] = p->no_prop;
	p->skip.prev_prop[0] = p->no_prop;
	p->skip.prev_prop[1] = p->no_prop;

	/* raw view: the next two properties, whatever they are */
	herodotus_reader_copy(r, &(p->raw_reader));
	p->raw.next_prop[0] = p->no_prop;
	p->raw.next_prop[1] = p->no_prop;
	filled = 0;
	while (filled < 2 &&
	       herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
	       HERODOTUS_STATUS_SUCCESS) {
		p->raw.next_prop[filled] = p->get_break_prop(cp);
		filled++;
	}

	/* skip view: the next two properties that are not skippable */
	herodotus_reader_copy(r, &(p->skip_reader));
	p->skip.next_prop[0] = p->no_prop;
	p->skip.next_prop[1] = p->no_prop;
	filled = 0;
	while (filled < 2 &&
	       herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
	       HERODOTUS_STATUS_SUCCESS) {
		prop = p->get_break_prop(cp);
		if (p->is_skippable_prop(prop)) {
			continue;
		}
		p->skip.next_prop[filled] = prop;
		filled++;
	}
}
/*
 * Move the viewing-line of p one codepoint forward.
 *
 * Returns 1, without shifting anything, if no property is left to the
 * right of the line (end of input), and 0 otherwise. The raw view is
 * always shifted; the skip view is shifted, and the skip-shift
 * callback invoked, only if the property that crossed the line is not
 * skippable.
 */
int
proper_advance(struct proper *p)
{
	uint_least8_t incoming;
	uint_least32_t cp;

	/* fetch the property that enters the raw view on the far right */
	incoming = (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
	            HERODOTUS_STATUS_SUCCESS) ?
	                   p->get_break_prop(cp) :
	                   p->no_prop;

	if (p->raw.next_prop[0] == p->no_prop) {
		/*
		 * the property that would cross the raw-viewing-line is
		 * a no_prop: we are at the end of the buffer
		 */
		return 1;
	}

	/* shift the raw view by one */
	p->raw.prev_prop[1] = p->raw.prev_prop[0];
	p->raw.prev_prop[0] = p->raw.next_prop[0];
	p->raw.next_prop[0] = p->raw.next_prop[1];
	p->raw.next_prop[1] = incoming;

	/* let the mid-reader follow the viewing-line */
	(void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);

	if (p->is_skippable_prop(p->raw.prev_prop[0])) {
		/* a skippable property crossed; the skip view stays put */
		return 0;
	}

	/*
	 * A non-skippable property crossed the raw-viewing-line (it is
	 * now in p->raw.prev_prop[0] and, given the check above, not a
	 * no_prop), so the skip view has to be shifted as well.
	 */
	p->skip.prev_prop[1] = p->skip.prev_prop[0];
	p->skip.prev_prop[0] = p->skip.next_prop[0];
	p->skip.next_prop[0] = p->skip.next_prop[1];

	/* report the property that crossed the skip-viewing-line */
	p->skip_shift_callback(p->skip.prev_prop[0], p->state);

	/* refill the far right with the next non-skippable property */
	p->skip.next_prop[1] = p->no_prop;
	while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
	       HERODOTUS_STATUS_SUCCESS) {
		incoming = p->get_break_prop(cp);
		if (!p->is_skippable_prop(incoming)) {
			p->skip.next_prop[1] = incoming;
			break;
		}
	}

	return 0;
}

View File

@@ -0,0 +1,116 @@
/* See LICENSE file for copyright and license details. */
#ifndef UTIL_H
#define UTIL_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "../gen/types.h"
#include "../grapheme.h"
/*
 * Smaller of x and y. One of the arguments is evaluated twice, so only
 * pass expressions without side effects.
 */
#undef MIN
#define MIN(x,y) ((x) < (y) ? (x) : (y))
/*
 * Number of elements of the array x. This only works on real arrays,
 * not on pointers.
 */
#undef LEN
#define LEN(x) (sizeof(x) / sizeof(*(x)))
/*
 * Branch-prediction hints. They expand to __builtin_expect() where the
 * compiler can report it as available via __has_builtin, and to the
 * plain expression everywhere else.
 */
#undef likely
#undef unlikely
#ifdef __has_builtin
#if __has_builtin(__builtin_expect)
#define likely(expr) __builtin_expect(!!(expr), 1)
#define unlikely(expr) __builtin_expect(!!(expr), 0)
#else
#define likely(expr) (expr)
#define unlikely(expr) (expr)
#endif
#else
#define likely(expr) (expr)
#define unlikely(expr) (expr)
#endif
/*
* Herodotus, the ancient Greek historian and geographer,
* was criticized for including legends and other fantastic
* accounts into his works, among others by his contemporary
* Thucydides.
*
* The Herodotus readers and writers are tailored towards the needs
* of the library interface, doing all the dirty work behind the
* scenes. While the reader is relatively faithful in his accounts,
* the Herodotus writer will never fail and always claim to write the
* data. Internally, it only writes as much as it can, and will simply
* keep account of the rest. This way, we can properly signal truncation.
*
* In this sense, explaining the naming, the writer is always a bit
* inaccurate in his accounts.
*
*/
/* result of herodotus_read_codepoint() */
enum herodotus_status {
/* a codepoint was read */
HERODOTUS_STATUS_SUCCESS,
/* no source, source exhausted, NUL terminator hit or sequence cut off */
HERODOTUS_STATUS_END_OF_BUFFER,
/* the offset has reached the innermost soft limit */
HERODOTUS_STATUS_SOFT_LIMIT_REACHED,
};
/* unit type of the buffer a reader or writer operates on */
enum herodotus_type {
/* array of uint_least32_t codepoints; offsets count codepoints */
HERODOTUS_TYPE_CODEPOINT,
/* array of char holding UTF-8; offsets count bytes */
HERODOTUS_TYPE_UTF8,
};
/* cursor over a codepoint or UTF-8 source buffer */
typedef struct herodotus_reader {
/* unit type of src */
enum herodotus_type type;
/* source buffer; if NULL, every read reports the end of the buffer */
const void *src;
/* length of src in units; SIZE_MAX makes a NUL codepoint end the input */
size_t srclen;
/* number of units consumed so far */
size_t off;
/* set once a NUL terminator has been read; all later reads fail */
bool terminated_by_null;
/* stack of soft limits (absolute offsets), innermost in slot 0; SIZE_MAX means unused */
size_t soft_limit[10];
} HERODOTUS_READER;
/* cursor over a codepoint or UTF-8 destination buffer */
typedef struct herodotus_writer {
/* unit type of dest */
enum herodotus_type type;
/* destination buffer; if NULL, nothing is stored and only off is counted */
void *dest;
/* capacity of dest in units */
size_t destlen;
/* number of units accounted for, including those that did not fit */
size_t off;
/* offset of the first UTF-8 sequence that was not stored; SIZE_MAX if none */
size_t first_unwritable_offset;
} HERODOTUS_WRITER;
/*
 * Sliding window of break properties around a candidate break position
 * (the "viewing-line"), kept in two variants: "raw" holds the
 * properties of all codepoints, "skip" only those for which
 * is_skippable_prop() is false.
 */
struct proper {
/*
* prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1]
*/
struct {
uint_least8_t prev_prop[2];
uint_least8_t next_prop[2];
} raw, skip;
/* mid_reader sits on the viewing-line, the other two read ahead for their view */
HERODOTUS_READER mid_reader, raw_reader, skip_reader;
/* opaque pointer handed to skip_shift_callback */
void *state;
/* marker for "no property", i.e. beyond either end of the input */
uint_least8_t no_prop;
/* maps a codepoint to its break property */
uint_least8_t (*get_break_prop)(uint_least32_t);
/* tells whether a property is left out of the skip view */
bool (*is_skippable_prop)(uint_least8_t);
/* called with each property that crosses the viewing-line of the skip view */
void (*skip_shift_callback)(uint_least8_t, void *);
};
/* reader: setup, copying and the soft-limit stack */
void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
const void *, size_t);
void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *);
void herodotus_reader_push_advance_limit(HERODOTUS_READER *, size_t);
void herodotus_reader_pop_limit(HERODOTUS_READER *);
/* reader: position queries and reading; the bool selects whether to advance */
size_t herodotus_reader_number_read(const HERODOTUS_READER *);
size_t herodotus_reader_next_word_break(const HERODOTUS_READER *);
size_t herodotus_reader_next_codepoint_break(const HERODOTUS_READER *);
enum herodotus_status herodotus_read_codepoint(HERODOTUS_READER *, bool, uint_least32_t *);
/* writer: setup, termination, accounting and writing */
void herodotus_writer_init(HERODOTUS_WRITER *, enum herodotus_type, void *,
size_t);
void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);
/* property window: proper_advance() returns non-zero at the end of the input */
void proper_init(const HERODOTUS_READER *, void *, uint_least8_t,
uint_least8_t (*get_break_prop)(uint_least32_t),
bool (*is_skippable_prop)(uint_least8_t),
void (*skip_shift_callback)(uint_least8_t, void *),
struct proper *);
int proper_advance(struct proper *);
#endif /* UTIL_H */

Binary file not shown.

View File

@@ -0,0 +1,268 @@
/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include "../gen/word.h"
#include "../grapheme.h"
#include "util.h"
/* left-context state of the word-break scan */
struct word_break_state
{
/*
 * true if the number of consecutive regional indicators directly left
 * of the break candidate is even (zero included); maintained by
 * word_skip_shift_callback()
 */
bool ri_even;
};
/*
 * Look up the word-break property of cp in the two-stage table (major
 * is indexed by the high bits, minor by major-offset plus the low
 * byte). Anything above U+10FFFF maps to WORD_BREAK_PROP_OTHER.
 */
static inline uint_least8_t
get_word_break_prop(uint_least32_t cp)
{
	if (!likely(cp <= 0x10FFFF)) {
		/* outside the Unicode codespace, so no table entry */
		return WORD_BREAK_PROP_OTHER;
	}

	return (uint_least8_t)
	       word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
}
/*
 * Tell whether prop is one of the properties (Extend, Format, ZWJ)
 * that the "skip" view of the word-break scan passes over.
 */
static bool
is_skippable_word_prop(uint_least8_t prop)
{
	if (prop == WORD_BREAK_PROP_EXTEND) {
		return true;
	}
	if (prop == WORD_BREAK_PROP_FORMAT) {
		return true;
	}

	return prop == WORD_BREAK_PROP_ZWJ;
}
/*
 * Skip-shift callback of the word-break scan. It is handed the
 * non-skippable property that has just moved to the left of the break
 * candidate and keeps track of whether the run of regional indicators
 * ending there has even length.
 */
static void
word_skip_shift_callback(uint_least8_t prop, void *s)
{
	struct word_break_state *state = (struct word_break_state *)s;

	if (prop != WORD_BREAK_PROP_REGIONAL_INDICATOR) {
		/*
		 * No regional indicator: the run left of the breakpoint
		 * has length zero, which is an even number.
		 */
		state->ri_even = true;
		return;
	}

	/* one more regional indicator in the run flips its parity */
	state->ri_even = !(state->ri_even);
}
static size_t
next_word_break(HERODOTUS_READER *r)
{
struct proper p;
struct word_break_state state = { .ri_even = true };
/*
* Apply word breaking algorithm (UAX #29), see
* https://unicode.org/reports/tr29/#Word_Boundary_Rules
*/
proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
is_skippable_word_prop, word_skip_shift_callback, &p);
while (!proper_advance(&p)) {
/* WB3 */
if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
continue;
}
/* WB3a */
if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
break;
}
/* WB3b */
if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
break;
}
/* WB3c */
if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
(p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
continue;
}
/* WB3d */
if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
continue;
}
/* WB4 */
if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
continue;
}
/* WB5 */
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
(p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB6 */
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
(p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
(p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB7 */
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
(p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
(p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB7a */
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
continue;
}
/* WB7b */
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
continue;
}
/* WB7c */
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
continue;
}
/* WB8 */
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB9 */
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB10 */
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
(p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
continue;
}
/* WB11 */
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB12 */
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
(p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
continue;
}
/* WB13 */
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
continue;
}
/* WB13a */
if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
continue;
}
/* WB13b */
if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
(p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
continue;
}
/* WB15 and WB16 */
if (!state.ri_even &&
p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
continue;
}
/* WB999 */
break;
}
return herodotus_reader_number_read(&(p.mid_reader));
}
/*
 * Word-break search on an array of codepoints.
 *
 * str and len are wrapped in a reader of type HERODOTUS_TYPE_CODEPOINT,
 * and the result of next_word_break() on that reader is returned as is.
 */
size_t
grapheme_next_word_break(const uint_least32_t *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, str, len);

	return next_word_break(&reader);
}
/*
 * Word-break search on a UTF-8 encoded buffer.
 *
 * str and len are wrapped in a reader of type HERODOTUS_TYPE_UTF8, and
 * the result of next_word_break() on that reader is returned as is.
 */
size_t
grapheme_next_word_break_utf8(const char *str, size_t len)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, str, len);

	return next_word_break(&reader);
}

Binary file not shown.