Initial Commit

This commit is contained in:
2025-08-30 16:07:19 +01:00
commit d86c15e30c
169 changed files with 121377 additions and 0 deletions

View File

@@ -0,0 +1,55 @@
/* See LICENSE file for copyright and license details. */
#include <errno.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../grapheme.h"
#include "../gen/word-test.h"
#include "util.h"
#define NUM_ITERATIONS 10000
/*
 * Payload handed to the benchmarked function: the source and destination
 * codepoint buffers that are passed to grapheme_to_uppercase().
 */
struct break_benchmark_payload {
/* input codepoints */
uint_least32_t *src;
/* number of elements in src */
size_t srclen;
/* output buffer; main() allocates it zero-initialised */
uint_least32_t *dest;
/* number of elements in dest; main() sets it to twice srclen */
size_t destlen;
};
/*
 * Benchmark body: performs one grapheme_to_uppercase() call from the
 * payload's source buffer into its destination buffer. The result of
 * the call is not inspected.
 */
void
libgrapheme(const void *payload)
{
	const struct break_benchmark_payload *bench = payload;

	grapheme_to_uppercase(bench->src, bench->srclen,
	                      bench->dest, bench->destlen);
}
/*
 * Build the codepoint test buffer from the word-break test data, allocate
 * a destination buffer twice its size and benchmark libgrapheme() on it.
 *
 * Returns 0 on success and 1 if either buffer could not be obtained.
 */
int
main(int argc, char *argv[])
{
	struct break_benchmark_payload p;
	double baseline = (double)NAN;

	(void)argc;

	if ((p.src = generate_cp_test_buffer(word_break_test,
	                                     LEN(word_break_test),
	                                     &(p.srclen))) == NULL) {
		return 1;
	}

	p.destlen = 2 * p.srclen;
	if ((p.dest = calloc(p.destlen, sizeof(*(p.dest)))) == NULL) {
		fprintf(stderr, "calloc: Out of memory\n");
		/*
		 * Bail out instead of benchmarking with a NULL destination
		 * buffer.
		 */
		free(p.src);
		return 1;
	}

	printf("%s\n", argv[0]);
	run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, "codepoint",
	              &baseline, NUM_ITERATIONS, p.srclen - 1);

	free(p.src);
	free(p.dest);

	return 0;
}

View File

@@ -0,0 +1,86 @@
/* See LICENSE file for copyright and license details. */
#include <errno.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../grapheme.h"
#include "../gen/character-test.h"
#include "util.h"
#include <utf8proc.h>
#define NUM_ITERATIONS 100000
/*
 * Benchmark input shared by the libgrapheme and utf8proc runs: the same
 * codepoint sequence, stored once per library-specific element type.
 */
struct break_benchmark_payload {
/* codepoints as passed to grapheme_is_character_break() */
uint_least32_t *buf;
/* element-wise copy of buf converted to utf8proc_int32_t by main() */
utf8proc_int32_t *buf_utf8proc;
/* number of elements in buf and in buf_utf8proc */
size_t buflen;
};
/*
 * Benchmark body: feeds every adjacent codepoint pair of the payload
 * buffer to grapheme_is_character_break(), carrying one state variable
 * across the whole pass. The break results are discarded.
 */
void
libgrapheme(const void *payload)
{
	const struct break_benchmark_payload *bench = payload;
	uint_least16_t state = 0;
	size_t pos = 0;

	/* "pos + 1 < buflen" also keeps an empty buffer from being read */
	while (pos + 1 < bench->buflen) {
		(void)grapheme_is_character_break(bench->buf[pos],
		                                  bench->buf[pos + 1],
		                                  &state);
		pos++;
	}
}
/*
 * Benchmark body: feeds every adjacent codepoint pair of the utf8proc
 * copy of the buffer to utf8proc_grapheme_break_stateful(), carrying one
 * state variable across the whole pass. The break results are discarded.
 */
void
libutf8proc(const void *payload)
{
	const struct break_benchmark_payload *bench = payload;
	utf8proc_int32_t state = 0;
	size_t pos = 0;

	/* "pos + 1 < buflen" also keeps an empty buffer from being read */
	while (pos + 1 < bench->buflen) {
		(void)utf8proc_grapheme_break_stateful(bench->buf_utf8proc[pos],
		                                       bench->buf_utf8proc[pos + 1],
		                                       &state);
		pos++;
	}
}
int
main(int argc, char *argv[])
{
struct break_benchmark_payload p;
double baseline = (double)NAN;
size_t i;
(void)argc;
if ((p.buf = generate_cp_test_buffer(character_break_test,
LEN(character_break_test),
&(p.buflen))) == NULL) {
return 1;
}
if ((p.buf_utf8proc = malloc(p.buflen * sizeof(*(p.buf_utf8proc)))) == NULL) {
fprintf(stderr, "malloc: %s\n", strerror(errno));
exit(1);
}
for (i = 0; i < p.buflen; i++) {
/*
* there is no overflow, as we know that the maximum
* codepoint is 0x10FFFF, which is way below 2^31
*/
p.buf_utf8proc[i] = (utf8proc_int32_t)p.buf[i];
}
printf("%s\n", argv[0]);
run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, "comparison",
&baseline, NUM_ITERATIONS, p.buflen - 1);
run_benchmark(libutf8proc, &p, "libutf8proc ", NULL, "comparison",
&baseline, NUM_ITERATIONS, p.buflen - 1);
free(p.buf);
free(p.buf_utf8proc);
return 0;
}

View File

@@ -0,0 +1,52 @@
/* See LICENSE file for copyright and license details. */
#include <errno.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../grapheme.h"
#include "../gen/line-test.h"
#include "util.h"
#define NUM_ITERATIONS 10000
/* Codepoint buffer handed to the benchmarked function. */
struct break_benchmark_payload {
/* codepoints scanned with grapheme_next_line_break() */
uint_least32_t *buf;
/* number of elements in buf */
size_t buflen;
};
/*
 * Benchmark body: walks the payload buffer from start to end, advancing
 * each step by the value grapheme_next_line_break() returns for the
 * remaining part of the buffer.
 */
void
libgrapheme(const void *payload)
{
	const struct break_benchmark_payload *bench = payload;
	size_t pos = 0;

	while (pos < bench->buflen) {
		pos += grapheme_next_line_break(bench->buf + pos,
		                                bench->buflen - pos);
	}
}
int
main(int argc, char *argv[])
{
struct break_benchmark_payload p;
double baseline = (double)NAN;
(void)argc;
if ((p.buf = generate_cp_test_buffer(line_break_test,
LEN(line_break_test),
&(p.buflen))) == NULL) {
return 1;
}
printf("%s\n", argv[0]);
run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, "codepoint",
&baseline, NUM_ITERATIONS, p.buflen - 1);
free(p.buf);
return 0;
}

View File

@@ -0,0 +1,52 @@
/* See LICENSE file for copyright and license details. */
#include <errno.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../grapheme.h"
#include "../gen/sentence-test.h"
#include "util.h"
#define NUM_ITERATIONS 100000
/* Codepoint buffer handed to the benchmarked function. */
struct break_benchmark_payload {
/* codepoints scanned with grapheme_next_sentence_break() */
uint_least32_t *buf;
/* number of elements in buf */
size_t buflen;
};
/*
 * Benchmark body: walks the payload buffer from start to end, advancing
 * each step by the value grapheme_next_sentence_break() returns for the
 * remaining part of the buffer.
 */
void
libgrapheme(const void *payload)
{
	const struct break_benchmark_payload *bench = payload;
	size_t pos = 0;

	while (pos < bench->buflen) {
		pos += grapheme_next_sentence_break(bench->buf + pos,
		                                    bench->buflen - pos);
	}
}
int
main(int argc, char *argv[])
{
struct break_benchmark_payload p;
double baseline = (double)NAN;
(void)argc;
if ((p.buf = generate_cp_test_buffer(sentence_break_test,
LEN(sentence_break_test),
&(p.buflen))) == NULL) {
return 1;
}
printf("%s\n", argv[0]);
run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, "codepoint",
&baseline, NUM_ITERATIONS, p.buflen - 1);
free(p.buf);
return 0;
}

View File

@@ -0,0 +1,95 @@
/* See LICENSE file for copyright and license details. */
#include <errno.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../grapheme.h"
#include "../gen/character-test.h"
#include "util.h"
#include <utf8proc.h>
#define NUM_ITERATIONS 100000
/*
 * Benchmark input shared by the libgrapheme and utf8proc decoders: the
 * same byte sequence, stored once per library-specific element type.
 */
struct utf8_benchmark_payload {
/* bytes as passed to grapheme_decode_utf8() */
char *buf;
/* byte-wise copy of buf converted to utf8proc_uint8_t by main() */
utf8proc_uint8_t *buf_utf8proc;
/* number of bytes in buf and in buf_utf8proc */
size_t buflen;
};
/*
 * Benchmark body: decodes the payload buffer one codepoint at a time with
 * grapheme_decode_utf8(). The pass stops early if a call reports more
 * bytes than remain in the buffer. Decoded codepoints are discarded.
 */
void
libgrapheme(const void *payload)
{
	const struct utf8_benchmark_payload *bench = payload;
	uint_least32_t cp;
	size_t pos = 0;

	while (pos < bench->buflen) {
		size_t remaining = bench->buflen - pos;
		size_t consumed = grapheme_decode_utf8(bench->buf + pos,
		                                       remaining, &cp);

		if (consumed > remaining) {
			break;
		}
		(void)cp;
		pos += consumed;
	}
}
/*
 * Benchmark body: decodes the utf8proc copy of the buffer one codepoint
 * at a time with utf8proc_iterate(). The pass stops early on a negative
 * return value. Decoded codepoints are discarded.
 */
void
libutf8proc(const void *payload)
{
	const struct utf8_benchmark_payload *bench = payload;
	utf8proc_int32_t cp;
	utf8proc_ssize_t consumed;
	size_t pos = 0;

	while (pos < bench->buflen) {
		consumed = utf8proc_iterate(bench->buf_utf8proc + pos,
		                            (utf8proc_ssize_t)(bench->buflen - pos),
		                            &cp);
		if (consumed < 0) {
			break;
		}
		(void)cp;
		pos += (size_t)consumed;
	}
}
int
main(int argc, char *argv[])
{
struct utf8_benchmark_payload p;
size_t i;
double baseline = (double)NAN;
(void)argc;
p.buf = generate_utf8_test_buffer(character_break_test,
LEN(character_break_test),
&(p.buflen));
/* convert cp-buffer to stupid custom libutf8proc-uint8-type */
if ((p.buf_utf8proc = malloc(p.buflen)) == NULL) {
fprintf(stderr, "malloc: %s\n", strerror(errno));
exit(1);
}
for (i = 0; i < p.buflen; i++) {
/*
* even if char is larger than 8 bit, it will only have
* any of the first 8 bits set (by construction).
*/
p.buf_utf8proc[i] = (utf8proc_uint8_t)p.buf[i];
}
printf("%s\n", argv[0]);
run_benchmark(libgrapheme, &p, "libgrapheme ", NULL,
"byte", &baseline, NUM_ITERATIONS, p.buflen);
run_benchmark(libutf8proc, &p, "libutf8proc ",
"but unsafe (does not detect overlong encodings)",
"byte", &baseline, NUM_ITERATIONS, p.buflen);
free(p.buf);
free(p.buf_utf8proc);
return 0;
}

View File

@@ -0,0 +1,115 @@
/* See LICENSE file for copyright and license details. */
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "../gen/types.h"
#include "../grapheme.h"
#include "util.h"
/*
 * Concatenate the codepoint sequences of all testlen entries of test into
 * one newly allocated buffer.
 *
 * The total number of codepoints is stored in *buflen. The caller owns
 * the returned buffer and releases it with free(). If the allocation
 * fails, an error is printed and the process exits with status 1.
 */
uint_least32_t *
generate_cp_test_buffer(const struct break_test *test, size_t testlen,
                        size_t *buflen)
{
	uint_least32_t *out, *dst;
	size_t t, k;

	/* first pass: add up the lengths of all test sequences */
	*buflen = 0;
	for (t = 0; t < testlen; t++) {
		*buflen += test[t].cplen;
	}

	out = calloc(*buflen, sizeof(*out));
	if (out == NULL) {
		fprintf(stderr, "generate_test_buffer: calloc: Out of memory.\n");
		exit(1);
	}

	/* second pass: copy the sequences back to back */
	dst = out;
	for (t = 0; t < testlen; t++) {
		for (k = 0; k < test[t].cplen; k++) {
			*dst++ = test[t].cp[k];
		}
	}

	return out;
}
/*
 * Encode the codepoint sequences of all testlen entries of test as UTF-8
 * into one newly allocated, NUL-terminated buffer.
 *
 * The buffer size in bytes, including the terminating NUL byte, is stored
 * in *buflen. The caller owns the returned buffer and releases it with
 * free(). If the allocation fails or the buffer turns out too small, an
 * error is printed and the process exits with status 1.
 */
char *
generate_utf8_test_buffer(const struct break_test *test, size_t testlen,
                          size_t *buflen)
{
	size_t t, k, pos, avail, written;
	char *out;

	/* first pass: a NULL destination only yields the encoded length */
	*buflen = 0;
	for (t = 0; t < testlen; t++) {
		for (k = 0; k < test[t].cplen; k++) {
			*buflen += grapheme_encode_utf8(test[t].cp[k], NULL, 0);
		}
	}
	*buflen += 1; /* terminating NUL-byte */

	out = malloc(*buflen);
	if (out == NULL) {
		fprintf(stderr, "generate_test_buffer: malloc: Out of memory.\n");
		exit(1);
	}

	/* second pass: encode every codepoint back to back */
	pos = 0;
	for (t = 0; t < testlen; t++) {
		for (k = 0; k < test[t].cplen; k++) {
			avail = *buflen - pos;
			written = grapheme_encode_utf8(test[t].cp[k],
			                               out + pos, avail);
			if (written > avail) {
				/* shouldn't happen */
				fprintf(stderr, "generate_utf8_test_buffer: "
				        "Buffer too small.\n");
				exit(1);
			}
			pos += written;
		}
	}
	out[*buflen - 1] = '\0';

	return out;
}
/*
 * Return the time elapsed from *a to *b in seconds, combining the
 * whole-second and nanosecond differences of the two timestamps.
 */
static double
time_diff(struct timespec *a, struct timespec *b)
{
	double whole_seconds = (double)(b->tv_sec - a->tv_sec);
	double nanoseconds = (double)(b->tv_nsec - a->tv_nsec);

	return whole_seconds + nanoseconds * 1E-9;
}
/*
 * Time num_iterations calls of func(payload) and print the average
 * duration per unit to stdout, with a progress dot printed along the way.
 *
 * name labels the output line and unit names what is being counted;
 * units_per_iteration is the number of such units one call of func
 * processes. If *baseline is NaN, the measured average is stored in
 * *baseline and reported as the baseline. Otherwise the result is
 * reported as a percentage faster or slower than *baseline, followed by
 * comment when comment is not NULL.
 */
void
run_benchmark(void (*func)(const void *), const void *payload,
              const char *name, const char *comment, const char *unit,
              double *baseline, size_t num_iterations,
              size_t units_per_iteration)
{
	struct timespec start, end;
	size_t i, progress_step;
	double diff;

	/*
	 * A dot is printed every tenth of the run. For fewer than ten
	 * iterations the integer division yields zero, which would turn
	 * the modulo below into a division by zero, so fall back to a
	 * step of one.
	 */
	progress_step = num_iterations / 10;
	if (progress_step == 0) {
		progress_step = 1;
	}

	printf("\t%s ", name);
	fflush(stdout);

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < num_iterations; i++) {
		func(payload);

		if (i % progress_step == 0) {
			printf(".");
			fflush(stdout);
		}
	}
	clock_gettime(CLOCK_MONOTONIC, &end);

	/* average time per iteration, then per unit within an iteration */
	diff = time_diff(&start, &end) / (double)num_iterations /
	       (double)units_per_iteration;

	if (isnan(*baseline)) {
		*baseline = diff;
		printf(" avg. %.3es/%s (baseline)\n", diff, unit);
	} else {
		printf(" avg. %.3es/%s (%.2f%% %s%s%s)\n", diff, unit,
		       fabs(1.0 - diff / *baseline) * 100,
		       (diff < *baseline) ? "faster" : "slower",
		       comment ? ", " : "",
		       comment ? comment : "");
	}
}

View File

@@ -0,0 +1,23 @@
/* See LICENSE file for copyright and license details. */
#ifndef UTIL_H
#define UTIL_H
#include "../gen/types.h"
/* Number of elements in the array x; x must be an array, not a pointer. */
#define LEN(x) (sizeof(x) / sizeof((x)[0]))

/*
 * Where the compiler supports the optnone attribute, declare the
 * benchmark entry points with it.
 */
#if defined(__has_attribute)
#if __has_attribute(optnone)
void libgrapheme(const void *payload) __attribute__((optnone));
void libutf8proc(const void *payload) __attribute__((optnone));
#endif
#endif

/* Build a codepoint buffer from testlen test entries; length in *buflen. */
uint_least32_t *generate_cp_test_buffer(const struct break_test *test,
                                        size_t testlen, size_t *buflen);

/* Build a UTF-8 byte buffer from testlen test entries; length in *buflen. */
char *generate_utf8_test_buffer(const struct break_test *test,
                                size_t testlen, size_t *buflen);

/* Time repeated calls of func(payload) and print the result to stdout. */
void run_benchmark(void (*func)(const void *), const void *payload,
                   const char *name, const char *comment, const char *unit,
                   double *baseline, size_t num_iterations,
                   size_t units_per_iteration);
#endif /* UTIL_H */

View File

@@ -0,0 +1,52 @@
/* See LICENSE file for copyright and license details. */
#include <errno.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../grapheme.h"
#include "../gen/word-test.h"
#include "util.h"
#define NUM_ITERATIONS 10000
/* Codepoint buffer handed to the benchmarked function. */
struct break_benchmark_payload {
/* codepoints scanned with grapheme_next_word_break() */
uint_least32_t *buf;
/* number of elements in buf */
size_t buflen;
};
/*
 * Benchmark body: walks the payload buffer from start to end, advancing
 * each step by the value grapheme_next_word_break() returns for the
 * remaining part of the buffer.
 */
void
libgrapheme(const void *payload)
{
	const struct break_benchmark_payload *bench = payload;
	size_t pos = 0;

	while (pos < bench->buflen) {
		pos += grapheme_next_word_break(bench->buf + pos,
		                                bench->buflen - pos);
	}
}
int
main(int argc, char *argv[])
{
struct break_benchmark_payload p;
double baseline = (double)NAN;
(void)argc;
if ((p.buf = generate_cp_test_buffer(word_break_test,
LEN(word_break_test),
&(p.buflen))) == NULL) {
return 1;
}
printf("%s\n", argv[0]);
run_benchmark(libgrapheme, &p, "libgrapheme ", NULL, "codepoint",
&baseline, NUM_ITERATIONS, p.buflen - 1);
free(p.buf);
return 0;
}