Initial Commit
This commit is contained in:
417
libs/libgrapheme-2.0.2/src/util.c
Normal file
417
libs/libgrapheme-2.0.2/src/util.c
Normal file
@@ -0,0 +1,417 @@
|
||||
/* See LICENSE file for copyright and license details. */
|
||||
#include <limits.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "../gen/types.h"
|
||||
#include "../grapheme.h"
|
||||
#include "util.h"
|
||||
|
||||
void
|
||||
herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
|
||||
const void *src, size_t srclen)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
r->type = type;
|
||||
r->src = src;
|
||||
r->srclen = srclen;
|
||||
r->off = 0;
|
||||
r->terminated_by_null = false;
|
||||
|
||||
for (i = 0; i < LEN(r->soft_limit); i++) {
|
||||
r->soft_limit[i] = SIZE_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
/*
|
||||
* we copy such that we have a "fresh" start and build on the
|
||||
* fact that src->soft_limit[i] for any i and src->srclen are
|
||||
* always larger or equal to src->off
|
||||
*/
|
||||
dest->type = src->type;
|
||||
if (src->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
dest->src = (src->src == NULL) ? NULL :
|
||||
((const uint_least32_t *)(src->src)) + src->off;
|
||||
} else { /* src->type == HERODOTUS_TYPE_UTF8 */
|
||||
dest->src = (src->src == NULL) ? NULL :
|
||||
((const char *)(src->src)) + src->off;
|
||||
}
|
||||
if (src->srclen == SIZE_MAX) {
|
||||
dest->srclen = SIZE_MAX;
|
||||
} else {
|
||||
dest->srclen = (src->off < src->srclen) ? src->srclen - src->off : 0;
|
||||
}
|
||||
dest->off = 0;
|
||||
dest->terminated_by_null = src->terminated_by_null;
|
||||
|
||||
for (i = 0; i < LEN(src->soft_limit); i++) {
|
||||
if (src->soft_limit[i] == SIZE_MAX) {
|
||||
dest->soft_limit[i] = SIZE_MAX;
|
||||
} else {
|
||||
/*
|
||||
* if we have a degenerate case where the offset is
|
||||
* higher than the soft-limit, we simply clamp the
|
||||
* soft-limit to zero given we can't decide here
|
||||
* to release the limit and, instead, we just
|
||||
* prevent any more reads
|
||||
*/
|
||||
dest->soft_limit[i] = (src->off < src->soft_limit[i]) ?
|
||||
src->soft_limit[i] - src->off : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
|
||||
r->soft_limit[i] = r->soft_limit[i - 1];
|
||||
}
|
||||
r->soft_limit[0] = r->off + count;
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_reader_pop_limit(HERODOTUS_READER *r)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
|
||||
r->soft_limit[i] = r->soft_limit[i + 1];
|
||||
}
|
||||
r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
|
||||
}
|
||||
|
||||
size_t
|
||||
herodotus_reader_next_word_break(const HERODOTUS_READER *r)
|
||||
{
|
||||
if (r->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
return grapheme_next_word_break(
|
||||
(const uint_least32_t *)(r->src) + r->off,
|
||||
MIN(r->srclen, r->soft_limit[0]) - r->off);
|
||||
} else { /* r->type == HERODOTUS_TYPE_UTF8 */
|
||||
return grapheme_next_word_break_utf8(
|
||||
(const char *)(r->src) + r->off,
|
||||
MIN(r->srclen, r->soft_limit[0]) - r->off);
|
||||
}
|
||||
}
|
||||
|
||||
size_t
|
||||
herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
|
||||
{
|
||||
if (r->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
|
||||
} else { /* r->type == HERODOTUS_TYPE_UTF8 */
|
||||
return grapheme_decode_utf8(
|
||||
(const char *)(r->src) + r->off,
|
||||
MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
size_t
|
||||
herodotus_reader_number_read(const HERODOTUS_READER *r)
|
||||
{
|
||||
return r->off;
|
||||
}
|
||||
|
||||
enum herodotus_status
|
||||
herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
|
||||
{
|
||||
size_t ret;
|
||||
|
||||
if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
|
||||
*cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
return HERODOTUS_STATUS_END_OF_BUFFER;
|
||||
}
|
||||
|
||||
if (r->off >= r->soft_limit[0]) {
|
||||
*cp = GRAPHEME_INVALID_CODEPOINT;
|
||||
return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
|
||||
}
|
||||
|
||||
if (r->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
*cp = ((const uint_least32_t *)(r->src))[r->off];
|
||||
ret = 1;
|
||||
} else { /* r->type == HERODOTUS_TYPE_UTF8 */
|
||||
ret = grapheme_decode_utf8((const char *)r->src + r->off,
|
||||
MIN(r->srclen, r->soft_limit[0]) -
|
||||
r->off, cp);
|
||||
}
|
||||
|
||||
if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
|
||||
/*
|
||||
* We encountered a null-codepoint. Don't increment
|
||||
* offset and return as if the buffer had ended here all
|
||||
* along
|
||||
*/
|
||||
r->terminated_by_null = true;
|
||||
return HERODOTUS_STATUS_END_OF_BUFFER;
|
||||
}
|
||||
|
||||
if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
|
||||
/*
|
||||
* we want more than we have; instead of returning
|
||||
* garbage we terminate here.
|
||||
*/
|
||||
return HERODOTUS_STATUS_END_OF_BUFFER;
|
||||
}
|
||||
|
||||
/*
|
||||
* Increase offset which we now know won't surpass the limits,
|
||||
* unless we got told otherwise
|
||||
*/
|
||||
if (advance) {
|
||||
r->off += ret;
|
||||
}
|
||||
|
||||
return HERODOTUS_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type,
|
||||
void *dest, size_t destlen)
|
||||
{
|
||||
w->type = type;
|
||||
w->dest = dest;
|
||||
w->destlen = destlen;
|
||||
w->off = 0;
|
||||
w->first_unwritable_offset = SIZE_MAX;
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
|
||||
{
|
||||
if (w->dest == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (w->off < w->destlen) {
|
||||
/* We still have space in the buffer. Simply use it */
|
||||
if (w->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
((uint_least32_t *)(w->dest))[w->off] = 0;
|
||||
} else { /* w->type == HERODOTUS_TYPE_UTF8 */
|
||||
((char *)(w->dest))[w->off] = '\0';
|
||||
}
|
||||
} else if (w->first_unwritable_offset < w->destlen) {
|
||||
/*
|
||||
* There is no more space in the buffer. However,
|
||||
* we have noted down the first offset we couldn't
|
||||
* use to write into the buffer and it's smaller than
|
||||
* destlen. Thus we bailed writing into the
|
||||
* destination when a multibyte-codepoint couldn't be
|
||||
* written. So the last "real" byte might be at
|
||||
* destlen-4, destlen-3, destlen-2 or destlen-1
|
||||
* (the last case meaning truncation).
|
||||
*/
|
||||
if (w->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
((uint_least32_t *)(w->dest))
|
||||
[w->first_unwritable_offset] = 0;
|
||||
} else { /* w->type == HERODOTUS_TYPE_UTF8 */
|
||||
((char *)(w->dest))[w->first_unwritable_offset] = '\0';
|
||||
}
|
||||
} else if (w->destlen > 0) {
|
||||
/*
|
||||
* In this case, there is no more space in the buffer and
|
||||
* the last unwritable offset is larger than
|
||||
* or equal to the destination buffer length. This means
|
||||
* that we are forced to simply write into the last
|
||||
* byte.
|
||||
*/
|
||||
if (w->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
((uint_least32_t *)(w->dest))
|
||||
[w->destlen - 1] = 0;
|
||||
} else { /* w->type == HERODOTUS_TYPE_UTF8 */
|
||||
((char *)(w->dest))[w->destlen - 1] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
/* w->off is not incremented in any case */
|
||||
}
|
||||
|
||||
size_t
|
||||
herodotus_writer_number_written(const HERODOTUS_WRITER *w)
|
||||
{
|
||||
return w->off;
|
||||
}
|
||||
|
||||
void
|
||||
herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
|
||||
{
|
||||
size_t ret;
|
||||
|
||||
/*
|
||||
* This function will always faithfully say how many codepoints
|
||||
* were written, even if the buffer ends. This is used to enable
|
||||
* truncation detection.
|
||||
*/
|
||||
if (w->type == HERODOTUS_TYPE_CODEPOINT) {
|
||||
if (w->dest != NULL && w->off < w->destlen) {
|
||||
((uint_least32_t *)(w->dest))[w->off] = cp;
|
||||
}
|
||||
|
||||
w->off += 1;
|
||||
} else { /* w->type == HERODOTUS_TYPE_UTF8 */
|
||||
/*
|
||||
* First determine how many bytes we need to encode the
|
||||
* codepoint
|
||||
*/
|
||||
ret = grapheme_encode_utf8(cp, NULL, 0);
|
||||
|
||||
if (w->dest != NULL && w->off + ret < w->destlen) {
|
||||
/* we still have enough room in the buffer */
|
||||
grapheme_encode_utf8(cp, (char *)(w->dest) +
|
||||
w->off, w->destlen - w->off);
|
||||
} else if (w->first_unwritable_offset == SIZE_MAX) {
|
||||
/*
|
||||
* the first unwritable offset has not been
|
||||
* noted down, so this is the first time we can't
|
||||
* write (completely) to an offset
|
||||
*/
|
||||
w->first_unwritable_offset = w->off;
|
||||
}
|
||||
|
||||
w->off += ret;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
|
||||
uint_least8_t (*get_break_prop)(uint_least32_t),
|
||||
bool (*is_skippable_prop)(uint_least8_t),
|
||||
void (*skip_shift_callback)(uint_least8_t, void *),
|
||||
struct proper *p)
|
||||
{
|
||||
uint_least8_t prop;
|
||||
uint_least32_t cp;
|
||||
size_t i;
|
||||
|
||||
/* set internal variables */
|
||||
p->state = state;
|
||||
p->no_prop = no_prop;
|
||||
p->get_break_prop = get_break_prop;
|
||||
p->is_skippable_prop = is_skippable_prop;
|
||||
p->skip_shift_callback = skip_shift_callback;
|
||||
|
||||
/*
|
||||
* Initialize mid-reader, which is basically just there
|
||||
* to reflect the current position of the viewing-line
|
||||
*/
|
||||
herodotus_reader_copy(r, &(p->mid_reader));
|
||||
|
||||
/*
|
||||
* In the initialization, we simply (try to) fill in next_prop.
|
||||
* If we cannot read in more (due to the buffer ending), we
|
||||
* fill in the prop as invalid
|
||||
*/
|
||||
|
||||
/*
|
||||
* initialize the previous properties to have no property
|
||||
* (given we are at the start of the buffer)
|
||||
*/
|
||||
p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
|
||||
p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
|
||||
|
||||
/*
|
||||
* initialize the next properties
|
||||
*/
|
||||
|
||||
/* initialize the raw reader */
|
||||
herodotus_reader_copy(r, &(p->raw_reader));
|
||||
|
||||
/* fill in the two next raw properties (after no-initialization) */
|
||||
p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
|
||||
for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS; ) {
|
||||
p->raw.next_prop[i++] = p->get_break_prop(cp);
|
||||
}
|
||||
|
||||
/* initialize the skip reader */
|
||||
herodotus_reader_copy(r, &(p->skip_reader));
|
||||
|
||||
/* fill in the two next skip properties (after no-initialization) */
|
||||
p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
|
||||
for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS; ) {
|
||||
prop = p->get_break_prop(cp);
|
||||
if (!p->is_skippable_prop(prop)) {
|
||||
p->skip.next_prop[i++] = prop;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
proper_advance(struct proper *p)
|
||||
{
|
||||
uint_least8_t prop;
|
||||
uint_least32_t cp;
|
||||
|
||||
/* read in next "raw" property */
|
||||
if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS) {
|
||||
prop = p->get_break_prop(cp);
|
||||
} else {
|
||||
prop = p->no_prop;
|
||||
}
|
||||
|
||||
/*
|
||||
* do a shift-in, unless we find that the property that is to
|
||||
* be moved past the "raw-viewing-line" (this property is stored
|
||||
* in p->raw.next_prop[0]) is a no_prop, indicating that
|
||||
* we are at the end of the buffer.
|
||||
*/
|
||||
if (p->raw.next_prop[0] == p->no_prop) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* shift in the properties */
|
||||
p->raw.prev_prop[1] = p->raw.prev_prop[0];
|
||||
p->raw.prev_prop[0] = p->raw.next_prop[0];
|
||||
p->raw.next_prop[0] = p->raw.next_prop[1];
|
||||
p->raw.next_prop[1] = prop;
|
||||
|
||||
/* advance the middle reader viewing-line */
|
||||
(void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
|
||||
|
||||
/* check skippability-property */
|
||||
if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
|
||||
/*
|
||||
* the property that has moved past the "raw-viewing-line"
|
||||
* (this property is now (after the raw-shift) stored in
|
||||
* p->raw.prev_prop[0] and guaranteed not to be a no-prop,
|
||||
* guaranteeing that we won't shift a no-prop past the
|
||||
* "viewing-line" in the skip-properties) is not a skippable
|
||||
* property, thus we need to shift the skip property as well.
|
||||
*/
|
||||
p->skip.prev_prop[1] = p->skip.prev_prop[0];
|
||||
p->skip.prev_prop[0] = p->skip.next_prop[0];
|
||||
p->skip.next_prop[0] = p->skip.next_prop[1];
|
||||
|
||||
/*
|
||||
* call the skip-shift-callback on the property that
|
||||
* passed the skip-viewing-line (this property is now
|
||||
* stored in p->skip.prev_prop[0]).
|
||||
*/
|
||||
p->skip_shift_callback(p->skip.prev_prop[0], p->state);
|
||||
|
||||
/* determine the next shift property */
|
||||
p->skip.next_prop[1] = p->no_prop;
|
||||
while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
|
||||
HERODOTUS_STATUS_SUCCESS) {
|
||||
prop = p->get_break_prop(cp);
|
||||
if (!p->is_skippable_prop(prop)) {
|
||||
p->skip.next_prop[1] = prop;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Reference in New Issue
Block a user