Initial Commit

2025-08-30 16:07:19 +01:00
commit d86c15e30c
169 changed files with 121377 additions and 0 deletions
--- a/libs/libgrapheme-2.0.2/src/util.c
+++ b/libs/libgrapheme-2.0.2/src/util.c
@@ -0,0 +1,417 @@
+/* See LICENSE file for copyright and license details. */
+#include <limits.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "../gen/types.h"
+#include "../grapheme.h"
+#include "util.h"
+
+void
+herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
+                      const void *src, size_t srclen)
+{
+	size_t i;
+
+	r->type = type;
+	r->src = src;
+	r->srclen = srclen;
+	r->off = 0;
+	r->terminated_by_null = false;
+
+	for (i = 0; i < LEN(r->soft_limit); i++) {
+		r->soft_limit[i] = SIZE_MAX;
+	}
+}
+
+void
+herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
+{
+	size_t i;
+
+	/*
+	 * we copy such that we have a "fresh" start and build on the
+	 * fact that src->soft_limit[i] for any i and src->srclen are
+	 * always larger or equal to src->off
+	 */
+	dest->type = src->type;
+	if (src->type == HERODOTUS_TYPE_CODEPOINT) {
+		dest->src = (src->src == NULL) ? NULL :
+		            ((const uint_least32_t *)(src->src)) + src->off;
+	} else { /* src->type == HERODOTUS_TYPE_UTF8 */
+		dest->src = (src->src == NULL) ? NULL :
+		            ((const char *)(src->src)) + src->off;
+	}
+	if (src->srclen == SIZE_MAX) {
+		dest->srclen = SIZE_MAX;
+	} else {
+		dest->srclen = (src->off < src->srclen) ? src->srclen - src->off : 0;
+	}
+	dest->off = 0;
+	dest->terminated_by_null = src->terminated_by_null;
+
+	for (i = 0; i < LEN(src->soft_limit); i++) {
+		if (src->soft_limit[i] == SIZE_MAX) {
+			dest->soft_limit[i] = SIZE_MAX;
+		} else {
+			/*
+			 * if we have a degenerate case where the offset is
+			 * higher than the soft-limit, we simply clamp the
+			 * soft-limit to zero given we can't decide here
+			 * to release the limit and, instead, we just
+			 * prevent any more reads
+			 */
+			dest->soft_limit[i] = (src->off < src->soft_limit[i]) ?
+				src->soft_limit[i] - src->off : 0;
+		}
+	}
+}
+
+void
+herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
+{
+	size_t i;
+
+	for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
+		r->soft_limit[i] = r->soft_limit[i - 1];
+	}
+	r->soft_limit[0] = r->off + count;
+}
+
+void
+herodotus_reader_pop_limit(HERODOTUS_READER *r)
+{
+	size_t i;
+
+	for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
+		r->soft_limit[i] = r->soft_limit[i + 1];
+	}
+	r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
+}
+
+size_t
+herodotus_reader_next_word_break(const HERODOTUS_READER *r)
+{
+	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
+		return grapheme_next_word_break(
+			(const uint_least32_t *)(r->src) + r->off,
+			MIN(r->srclen, r->soft_limit[0]) - r->off);
+	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
+		return grapheme_next_word_break_utf8(
+			(const char *)(r->src) + r->off,
+			MIN(r->srclen, r->soft_limit[0]) - r->off);
+	}
+}
+
+size_t
+herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
+{
+	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
+		return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
+	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
+		return grapheme_decode_utf8(
+			(const char *)(r->src) + r->off,
+			MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
+	}
+}
+
+size_t
+herodotus_reader_number_read(const HERODOTUS_READER *r)
+{
+	return r->off;
+}
+
+enum herodotus_status
+herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
+{
+	size_t ret;
+
+	if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
+		*cp = GRAPHEME_INVALID_CODEPOINT;
+		return HERODOTUS_STATUS_END_OF_BUFFER;
+	}
+
+	if (r->off >= r->soft_limit[0]) {
+		*cp = GRAPHEME_INVALID_CODEPOINT;
+		return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
+	}
+
+	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
+		*cp = ((const uint_least32_t *)(r->src))[r->off];
+		ret = 1;
+	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
+		ret = grapheme_decode_utf8((const char *)r->src + r->off,
+		                           MIN(r->srclen, r->soft_limit[0]) -
+		                           r->off, cp);
+	}
+
+	if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
+		/*
+		 * We encountered a null-codepoint. Don't increment
+		 * offset and return as if the buffer had ended here all
+		 * along
+		 */
+		r->terminated_by_null = true;
+		return HERODOTUS_STATUS_END_OF_BUFFER;
+	}
+
+	if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
+		/*
+		 * we want more than we have; instead of returning
+		 * garbage we terminate here.
+		 */
+		return HERODOTUS_STATUS_END_OF_BUFFER;
+	}
+
+	/*
+	 * Increase offset which we now know won't surpass the limits,
+	 * unless we got told otherwise
+	 */
+	if (advance) {
+		r->off += ret;
+	}
+
+	return HERODOTUS_STATUS_SUCCESS;
+}
+
+void
+herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type,
+                      void *dest, size_t destlen)
+{
+	w->type = type;
+	w->dest = dest;
+	w->destlen = destlen;
+	w->off = 0;
+	w->first_unwritable_offset = SIZE_MAX;
+}
+
+void
+herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
+{
+	if (w->dest == NULL) {
+		return;
+	}
+
+	if (w->off < w->destlen) {
+		/* We still have space in the buffer. Simply use it */
+		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
+			((uint_least32_t *)(w->dest))[w->off] = 0;
+		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
+			((char *)(w->dest))[w->off] = '\0';
+		}
+	} else if (w->first_unwritable_offset < w->destlen) {
+		/*
+		 * There is no more space in the buffer. However,
+		 * we have noted down the first offset we couldn't
+		 * use to write into the buffer and it's smaller than
+		 * destlen. Thus we bailed writing into the
+		 * destination when a multibyte-codepoint couldn't be
+		 * written. So the last "real" byte might be at
+		 * destlen-4, destlen-3, destlen-2 or destlen-1
+		 * (the last case meaning truncation).
+		 */
+		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
+			((uint_least32_t *)(w->dest))
+				[w->first_unwritable_offset] = 0;
+		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
+			((char *)(w->dest))[w->first_unwritable_offset] = '\0';
+		}
+	} else if (w->destlen > 0) {
+		/*
+		 * In this case, there is no more space in the buffer and
+		 * the last unwritable offset is larger than
+		 * or equal to the destination buffer length. This means
+		 * that we are forced to simply write into the last
+		 * byte.
+		 */
+		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
+			((uint_least32_t *)(w->dest))
+				[w->destlen - 1] = 0;
+		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
+			((char *)(w->dest))[w->destlen - 1] = '\0';
+		}
+	}
+
+	/* w->off is not incremented in any case */
+}
+
+size_t
+herodotus_writer_number_written(const HERODOTUS_WRITER *w)
+{
+	return w->off;
+}
+
+void
+herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
+{
+	size_t ret;
+
+	/*
+	 * This function will always faithfully say how many codepoints
+	 * were written, even if the buffer ends. This is used to enable
+	 * truncation detection.
+	 */
+	if (w->type == HERODOTUS_TYPE_CODEPOINT) {
+		if (w->dest != NULL && w->off < w->destlen) {
+			((uint_least32_t *)(w->dest))[w->off] = cp;
+		}
+
+		w->off += 1;
+	} else { /* w->type == HERODOTUS_TYPE_UTF8 */
+		/*
+		 * First determine how many bytes we need to encode the
+		 * codepoint
+		 */
+		ret = grapheme_encode_utf8(cp, NULL, 0);
+
+		if (w->dest != NULL && w->off + ret < w->destlen) {
+			/* we still have enough room in the buffer */
+			grapheme_encode_utf8(cp, (char *)(w->dest) +
+			                     w->off, w->destlen - w->off);
+		} else if (w->first_unwritable_offset == SIZE_MAX) {
+			/*
+			 * the first unwritable offset has not been
+			 * noted down, so this is the first time we can't
+			 * write (completely) to an offset
+			 */
+			w->first_unwritable_offset = w->off;
+		}
+
+		w->off += ret;
+	}
+}
+
+void
+proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
+            uint_least8_t (*get_break_prop)(uint_least32_t),
+            bool (*is_skippable_prop)(uint_least8_t),
+            void (*skip_shift_callback)(uint_least8_t, void *),
+            struct proper *p)
+{
+	uint_least8_t prop;
+	uint_least32_t cp;
+	size_t i;
+
+	/* set internal variables */
+	p->state = state;
+	p->no_prop = no_prop;
+	p->get_break_prop = get_break_prop;
+	p->is_skippable_prop = is_skippable_prop;
+	p->skip_shift_callback = skip_shift_callback;
+
+	/*
+	 * Initialize mid-reader, which is basically just there
+	 * to reflect the current position of the viewing-line
+	 */
+	herodotus_reader_copy(r, &(p->mid_reader));
+
+	/*
+	 * In the initialization, we simply (try to) fill in next_prop.
+	 * If we cannot read in more (due to the buffer ending), we
+	 * fill in the prop as invalid
+	 */
+
+	/*
+	 * initialize the previous properties to have no property
+	 * (given we are at the start of the buffer)
+	 */
+	p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
+	p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
+
+	/*
+	 * initialize the next properties
+	 */
+
+	/* initialize the raw reader */
+	herodotus_reader_copy(r, &(p->raw_reader));
+
+	/* fill in the two next raw properties (after no-initialization) */
+	p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
+	for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+	     HERODOTUS_STATUS_SUCCESS; ) {
+		p->raw.next_prop[i++] = p->get_break_prop(cp);
+	}
+
+	/* initialize the skip reader */
+	herodotus_reader_copy(r, &(p->skip_reader));
+
+	/* fill in the two next skip properties (after no-initialization) */
+	p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
+	for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
+	     HERODOTUS_STATUS_SUCCESS; ) {
+		prop = p->get_break_prop(cp);
+		if (!p->is_skippable_prop(prop)) {
+			p->skip.next_prop[i++] = prop;
+		}
+	}
+}
+
+int
+proper_advance(struct proper *p)
+{
+	uint_least8_t prop;
+	uint_least32_t cp;
+
+	/* read in next "raw" property */
+	if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
+	    HERODOTUS_STATUS_SUCCESS) {
+		prop = p->get_break_prop(cp);
+	} else {
+		prop = p->no_prop;
+	}
+
+	/*
+	 * do a shift-in, unless we find that the property that is to
+	 * be moved past the "raw-viewing-line" (this property is stored
+	 * in p->raw.next_prop[0]) is a no_prop, indicating that
+	 * we are at the end of the buffer.
+	 */
+	if (p->raw.next_prop[0] == p->no_prop) {
+		return 1;
+	}
+
+	/* shift in the properties */
+	p->raw.prev_prop[1] = p->raw.prev_prop[0];
+	p->raw.prev_prop[0] = p->raw.next_prop[0];
+	p->raw.next_prop[0] = p->raw.next_prop[1];
+	p->raw.next_prop[1] = prop;
+
+	/* advance the middle reader viewing-line */
+	(void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
+
+	/* check skippability-property */
+	if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
+		/*
+		 * the property that has moved past the "raw-viewing-line"
+		 * (this property is now (after the raw-shift) stored in
+		 * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
+		 * guaranteeing that we won't shift a no-prop past the
+		 * "viewing-line" in the skip-properties) is not a skippable
+		 * property, thus we need to shift the skip property as well.
+		 */
+		p->skip.prev_prop[1] = p->skip.prev_prop[0];
+		p->skip.prev_prop[0] = p->skip.next_prop[0];
+		p->skip.next_prop[0] = p->skip.next_prop[1];
+
+		/*
+		 * call the skip-shift-callback on the property that
+		 * passed the skip-viewing-line (this property is now
+		 * stored in p->skip.prev_prop[0]).
+		 */
+		p->skip_shift_callback(p->skip.prev_prop[0], p->state);
+
+		/* determine the next shift property */
+		p->skip.next_prop[1] = p->no_prop;
+		while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
+		       HERODOTUS_STATUS_SUCCESS) {
+			prop = p->get_break_prop(cp);
+			if (!p->is_skippable_prop(prop)) {
+				p->skip.next_prop[1] = prop;
+				break;
+			}
+		}
+	}
+
+	return 0;
+}