Initial Commit

This commit is contained in:
2025-08-30 16:07:19 +01:00
commit d86c15e30c
169 changed files with 121377 additions and 0 deletions

View File

@@ -0,0 +1,100 @@
.Dd 2022-10-06
.Dt GRAPHEME_DECODE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_decode_utf8
.Nd decode first codepoint in UTF-8-encoded string
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
.Sh DESCRIPTION
The
.Fn grapheme_decode_utf8
function decodes the first codepoint in the UTF-8-encoded string
.Va str
of length
.Va len .
If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
string ends unexpectedly, empty string, etc.) the decoding is stopped
at the last processed byte and the decoded codepoint is set to
.Dv GRAPHEME_INVALID_CODEPOINT .
.Pp
If
.Va cp
is not
.Dv NULL
the decoded codepoint is stored in the memory pointed to by
.Va cp .
.Pp
Given NUL has a unique 1 byte representation, it is safe to operate on
NUL-terminated strings by setting
.Va len
to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) and terminating when
.Va cp
is 0 (see
.Sx EXAMPLES
for an example).
.Sh RETURN VALUES
The
.Fn grapheme_decode_utf8
function returns the number of processed bytes and 0 if
.Va str
is
.Dv NULL
or
.Va len
is 0.
If the string ends unexpectedly in a multibyte sequence, the desired
length (that is larger than
.Va len )
is returned.
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <inttypes.h>
#include <stdio.h>
void
print_cps(const char *str, size_t len)
{
size_t ret, off;
uint_least32_t cp;
for (off = 0; off < len; off += ret) {
if ((ret = grapheme_decode_utf8(str + off,
len - off, &cp)) > (len - off)) {
/*
* string ended unexpectedly in the middle of a
* multibyte sequence and we have the choice
* here to possibly expand str by ret - len + off
* bytes to get a full sequence, but we just
* bail out in this case.
*/
break;
}
printf("%"PRIxLEAST32"\\n", cp);
}
}
void
print_cps_nul_terminated(const char *str)
{
size_t ret, off;
uint_least32_t cp;
for (off = 0; (ret = grapheme_decode_utf8(str + off,
SIZE_MAX, &cp)) > 0 &&
cp != 0; off += ret) {
printf("%"PRIxLEAST32"\\n", cp);
}
}
.Ed
.Sh SEE ALSO
.Xr grapheme_encode_utf8 3 ,
.Xr libgrapheme 7
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,102 @@
cat << EOF
.Dd ${MAN_DATE}
.Dt GRAPHEME_DECODE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_decode_utf8
.Nd decode first codepoint in UTF-8-encoded string
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
.Sh DESCRIPTION
The
.Fn grapheme_decode_utf8
function decodes the first codepoint in the UTF-8-encoded string
.Va str
of length
.Va len .
If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
string ends unexpectedly, empty string, etc.) the decoding is stopped
at the last processed byte and the decoded codepoint is set to
.Dv GRAPHEME_INVALID_CODEPOINT .
.Pp
If
.Va cp
is not
.Dv NULL
the decoded codepoint is stored in the memory pointed to by
.Va cp .
.Pp
Given NUL has a unique 1 byte representation, it is safe to operate on
NUL-terminated strings by setting
.Va len
to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) and terminating when
.Va cp
is 0 (see
.Sx EXAMPLES
for an example).
.Sh RETURN VALUES
The
.Fn grapheme_decode_utf8
function returns the number of processed bytes and 0 if
.Va str
is
.Dv NULL
or
.Va len
is 0.
If the string ends unexpectedly in a multibyte sequence, the desired
length (that is larger than
.Va len )
is returned.
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <inttypes.h>
#include <stdio.h>
void
print_cps(const char *str, size_t len)
{
size_t ret, off;
uint_least32_t cp;
for (off = 0; off < len; off += ret) {
if ((ret = grapheme_decode_utf8(str + off,
len - off, &cp)) > (len - off)) {
/*
* string ended unexpectedly in the middle of a
* multibyte sequence and we have the choice
* here to possibly expand str by ret - len + off
* bytes to get a full sequence, but we just
* bail out in this case.
*/
break;
}
printf("%"PRIxLEAST32"\\\\n", cp);
}
}
void
print_cps_nul_terminated(const char *str)
{
size_t ret, off;
uint_least32_t cp;
for (off = 0; (ret = grapheme_decode_utf8(str + off,
SIZE_MAX, &cp)) > 0 &&
cp != 0; off += ret) {
printf("%"PRIxLEAST32"\\\\n", cp);
}
}
.Ed
.Sh SEE ALSO
.Xr grapheme_encode_utf8 3 ,
.Xr libgrapheme 7
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF

View File

@@ -0,0 +1,101 @@
.Dd 2022-10-06
.Dt GRAPHEME_ENCODE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_encode_utf8
.Nd encode codepoint into UTF-8 string
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_encode_utf8
function encodes the codepoint
.Va cp
into a UTF-8-string.
If
.Va str
is not
.Dv NULL
and
.Va len
is large enough it writes the UTF-8-string to the memory pointed to by
.Va str .
Otherwise no data is written.
.Sh RETURN VALUES
The
.Fn grapheme_encode_utf8
function returns the length (in bytes) of the UTF-8-string resulting
from encoding
.Va cp ,
even if
.Va len
is not large enough or
.Va str
is
.Dv NULL .
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stddef.h>
#include <stdlib.h>
size_t
cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
{
size_t i, off, ret;
for (i = 0, off = 0; i < cplen; i++, off += ret) {
if ((ret = grapheme_encode_utf8(cp[i], str + off,
len - off)) > (len - off)) {
/* buffer too small */
break;
}
}
return off;
}
size_t
cps_bytelen(const uint_least32_t *cp, size_t cplen)
{
size_t i, len;
for (i = 0, len = 0; i < cplen; i++) {
len += grapheme_encode_utf8(cp[i], NULL, 0);
}
return len;
}
char *
cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
{
char *str;
size_t len, i, ret, off;
len = cps_bytelen(cp, cplen);
if (!(str = malloc(len + 1))) {
return NULL;
}
for (i = 0, off = 0; i < cplen; i++, off += ret) {
if ((ret = grapheme_encode_utf8(cp[i], str + off,
len - off)) > (len - off)) {
/* buffer too small */
break;
}
}
str[off] = '\\0';
return str;
}
.Ed
.Sh SEE ALSO
.Xr grapheme_decode_utf8 3 ,
.Xr libgrapheme 7
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,103 @@
cat << EOF
.Dd ${MAN_DATE}
.Dt GRAPHEME_ENCODE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_encode_utf8
.Nd encode codepoint into UTF-8 string
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_encode_utf8
function encodes the codepoint
.Va cp
into a UTF-8-string.
If
.Va str
is not
.Dv NULL
and
.Va len
is large enough it writes the UTF-8-string to the memory pointed to by
.Va str .
Otherwise no data is written.
.Sh RETURN VALUES
The
.Fn grapheme_encode_utf8
function returns the length (in bytes) of the UTF-8-string resulting
from encoding
.Va cp ,
even if
.Va len
is not large enough or
.Va str
is
.Dv NULL .
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stddef.h>
#include <stdlib.h>
size_t
cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
{
size_t i, off, ret;
for (i = 0, off = 0; i < cplen; i++, off += ret) {
if ((ret = grapheme_encode_utf8(cp[i], str + off,
len - off)) > (len - off)) {
/* buffer too small */
break;
}
}
return off;
}
size_t
cps_bytelen(const uint_least32_t *cp, size_t cplen)
{
size_t i, len;
for (i = 0, len = 0; i < cplen; i++) {
len += grapheme_encode_utf8(cp[i], NULL, 0);
}
return len;
}
char *
cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
{
char *str;
size_t len, i, ret, off;
len = cps_bytelen(cp, cplen);
if (!(str = malloc(len + 1))) {
return NULL;
}
for (i = 0, off = 0; i < cplen; i++, off += ret) {
if ((ret = grapheme_encode_utf8(cp[i], str + off,
len - off)) > (len - off)) {
/* buffer too small */
break;
}
}
str[off] = '\\\\0';
return str;
}
.Ed
.Sh SEE ALSO
.Xr grapheme_decode_utf8 3 ,
.Xr libgrapheme 7
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF

View File

@@ -0,0 +1,81 @@
.Dd 2022-10-06
.Dt GRAPHEME_IS_CHARACTER_BREAK 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_character_break
.Nd test for a grapheme cluster break between two codepoints
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uint_least16_t *state"
.Sh DESCRIPTION
The
.Fn grapheme_is_character_break
function determines if there is a grapheme cluster break (see
.Xr libgrapheme 7 )
between the two codepoints
.Va cp1
and
.Va cp2 .
By specification this decision depends on a
.Va state
that can at most be completely reset after detecting a break and must
be reset every time one deviates from sequential processing.
.Pp
If
.Va state
is
.Dv NULL
.Fn grapheme_is_character_break
behaves as if it was called with a fully reset state.
.Sh RETURN VALUES
The
.Fn grapheme_is_character_break
function returns
.Va true
if there is a grapheme cluster break between the codepoints
.Va cp1
and
.Va cp2
and
.Va false
if there is not.
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int
main(void)
{
uint_least16_t state = 0;
uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
size_t i;
for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
if (grapheme_is_character_break(s1[i], s1[i + 1], &state)) {
printf("break in s1 at offset %zu\\n", i);
}
}
memset(&state, 0, sizeof(state)); /* reset state */
for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
if (grapheme_is_character_break(s2[i], s2[i + 1], &state)) {
printf("break in s2 at offset %zu\\n", i);
}
}
return 0;
}
.Ed
.Sh SEE ALSO
.Xr grapheme_next_character_break 3 ,
.Xr grapheme_next_character_break_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_character_break
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,83 @@
cat << EOF
.Dd ${MAN_DATE}
.Dt GRAPHEME_IS_CHARACTER_BREAK 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_character_break
.Nd test for a grapheme cluster break between two codepoints
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uint_least16_t *state"
.Sh DESCRIPTION
The
.Fn grapheme_is_character_break
function determines if there is a grapheme cluster break (see
.Xr libgrapheme 7 )
between the two codepoints
.Va cp1
and
.Va cp2 .
By specification this decision depends on a
.Va state
that can at most be completely reset after detecting a break and must
be reset every time one deviates from sequential processing.
.Pp
If
.Va state
is
.Dv NULL
.Fn grapheme_is_character_break
behaves as if it was called with a fully reset state.
.Sh RETURN VALUES
The
.Fn grapheme_is_character_break
function returns
.Va true
if there is a grapheme cluster break between the codepoints
.Va cp1
and
.Va cp2
and
.Va false
if there is not.
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int
main(void)
{
uint_least16_t state = 0;
uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
size_t i;
for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
if (grapheme_is_character_break(s1[i], s1[i + 1], &state)) {
printf("break in s1 at offset %zu\\\\n", i);
}
}
memset(&state, 0, sizeof(state)); /* reset state */
for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
if (grapheme_is_character_break(s2[i], s2[i + 1], &state)) {
printf("break in s2 at offset %zu\\\\n", i);
}
}
return 0;
}
.Ed
.Sh SEE ALSO
.Xr grapheme_next_character_break 3 ,
.Xr grapheme_next_character_break_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_character_break
is compliant with the Unicode ${UNICODE_VERSION} specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF

View File

@@ -0,0 +1,51 @@
.Dd 2022-10-06
.Dt GRAPHEME_IS_LOWERCASE 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_lowercase
.Nd check if codepoint array is lowercase
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_lowercase "const uint_least32_t *str" "size_t len" "size_t *caselen"
.Sh DESCRIPTION
The
.Fn grapheme_is_lowercase
function checks if the codepoint array
.Va str
is lowercase and writes the length of the matching lowercase-sequence to the integer pointed to by
.Va caselen ,
unless
.Va caselen
is set to
.Dv NULL .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the codepoint array
.Va str
is interpreted to be NUL-terminated and processing stops when
a codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input data
.Xr grapheme_is_lowercase_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_is_lowercase
function returns
.Dv true
if the codepoint array
.Va str
is lowercase, otherwise
.Dv false .
.Sh SEE ALSO
.Xr grapheme_is_lowercase_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_lowercase
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
ENCODING="codepoint" \
CASE="lowercase" \
$SH man/template/is_case.sh

View File

@@ -0,0 +1,51 @@
.Dd 2022-10-06
.Dt GRAPHEME_IS_LOWERCASE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_lowercase_utf8
.Nd check if UTF-8-encoded string is lowercase
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_lowercase_utf8 "const char *str" "size_t len" "size_t *caselen"
.Sh DESCRIPTION
The
.Fn grapheme_is_lowercase_utf8
function checks if the UTF-8-encoded string
.Va str
is lowercase and writes the length of the matching lowercase-sequence to the integer pointed to by
.Va caselen ,
unless
.Va caselen
is set to
.Dv NULL .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
.Va str
is interpreted to be NUL-terminated and processing stops when a
NUL-byte is encountered.
.Pp
For non-UTF-8 input data
.Xr grapheme_is_lowercase 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_is_lowercase_utf8
function returns
.Dv true
if the UTF-8-encoded string
.Va str
is lowercase, otherwise
.Dv false .
.Sh SEE ALSO
.Xr grapheme_is_lowercase 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_lowercase_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
ENCODING="utf8" \
CASE="lowercase" \
$SH man/template/is_case.sh

View File

@@ -0,0 +1,51 @@
.Dd 2022-10-06
.Dt GRAPHEME_IS_TITLECASE 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_titlecase
.Nd check if codepoint array is titlecase
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_titlecase "const uint_least32_t *str" "size_t len" "size_t *caselen"
.Sh DESCRIPTION
The
.Fn grapheme_is_titlecase
function checks if the codepoint array
.Va str
is titlecase and writes the length of the matching titlecase-sequence to the integer pointed to by
.Va caselen ,
unless
.Va caselen
is set to
.Dv NULL .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the codepoint array
.Va str
is interpreted to be NUL-terminated and processing stops when
a codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input data
.Xr grapheme_is_titlecase_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_is_titlecase
function returns
.Dv true
if the codepoint array
.Va str
is titlecase, otherwise
.Dv false .
.Sh SEE ALSO
.Xr grapheme_is_titlecase_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_titlecase
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
ENCODING="codepoint" \
CASE="titlecase" \
$SH man/template/is_case.sh

View File

@@ -0,0 +1,51 @@
.Dd 2022-10-06
.Dt GRAPHEME_IS_TITLECASE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_titlecase_utf8
.Nd check if UTF-8-encoded string is titlecase
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_titlecase_utf8 "const char *str" "size_t len" "size_t *caselen"
.Sh DESCRIPTION
The
.Fn grapheme_is_titlecase_utf8
function checks if the UTF-8-encoded string
.Va str
is titlecase and writes the length of the matching titlecase-sequence to the integer pointed to by
.Va caselen ,
unless
.Va caselen
is set to
.Dv NULL .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
.Va str
is interpreted to be NUL-terminated and processing stops when a
NUL-byte is encountered.
.Pp
For non-UTF-8 input data
.Xr grapheme_is_titlecase 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_is_titlecase_utf8
function returns
.Dv true
if the UTF-8-encoded string
.Va str
is titlecase, otherwise
.Dv false .
.Sh SEE ALSO
.Xr grapheme_is_titlecase 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_titlecase_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
ENCODING="utf8" \
CASE="titlecase" \
$SH man/template/is_case.sh

View File

@@ -0,0 +1,51 @@
.Dd 2022-10-06
.Dt GRAPHEME_IS_UPPERCASE 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_uppercase
.Nd check if codepoint array is uppercase
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_uppercase "const uint_least32_t *str" "size_t len" "size_t *caselen"
.Sh DESCRIPTION
The
.Fn grapheme_is_uppercase
function checks if the codepoint array
.Va str
is uppercase and writes the length of the matching uppercase-sequence to the integer pointed to by
.Va caselen ,
unless
.Va caselen
is set to
.Dv NULL .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the codepoint array
.Va str
is interpreted to be NUL-terminated and processing stops when
a codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input data
.Xr grapheme_is_uppercase_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_is_uppercase
function returns
.Dv true
if the codepoint array
.Va str
is uppercase, otherwise
.Dv false .
.Sh SEE ALSO
.Xr grapheme_is_uppercase_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_uppercase
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
ENCODING="codepoint" \
CASE="uppercase" \
$SH man/template/is_case.sh

View File

@@ -0,0 +1,51 @@
.Dd 2022-10-06
.Dt GRAPHEME_IS_UPPERCASE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_uppercase_utf8
.Nd check if UTF-8-encoded string is uppercase
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_uppercase_utf8 "const char *str" "size_t len" "size_t *caselen"
.Sh DESCRIPTION
The
.Fn grapheme_is_uppercase_utf8
function checks if the UTF-8-encoded string
.Va str
is uppercase and writes the length of the matching uppercase-sequence to the integer pointed to by
.Va caselen ,
unless
.Va caselen
is set to
.Dv NULL .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
.Va str
is interpreted to be NUL-terminated and processing stops when a
NUL-byte is encountered.
.Pp
For non-UTF-8 input data
.Xr grapheme_is_uppercase 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_is_uppercase_utf8
function returns
.Dv true
if the UTF-8-encoded string
.Va str
is uppercase, otherwise
.Dv false .
.Sh SEE ALSO
.Xr grapheme_is_uppercase 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_uppercase_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
ENCODING="utf8" \
CASE="uppercase" \
$SH man/template/is_case.sh

View File

@@ -0,0 +1,56 @@
.Dd 2022-10-06
.Dt GRAPHEME_NEXT_CHARACTER_BREAK 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_character_break
.Nd determine codepoint-offset to next grapheme cluster break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_character_break "const uint_least32_t *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_character_break
function computes the offset (in codepoints) to the next grapheme cluster
break (see
.Xr libgrapheme 7 )
in the codepoint array
.Va str
of length
.Va len .
If a grapheme cluster begins at
.Va str
this offset is equal to the length of said grapheme cluster.
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input
data
.Xr grapheme_next_character_break_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_character_break
function returns the offset (in codepoints) to the next grapheme cluster
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_is_character_break 3 ,
.Xr grapheme_next_character_break_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_character_break
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,4 @@
ENCODING="codepoint" \
TYPE="character" \
REALTYPE="grapheme cluster" \
$SH man/template/next_break.sh

View File

@@ -0,0 +1,94 @@
.Dd 2022-10-06
.Dt GRAPHEME_NEXT_CHARACTER_BREAK_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_character_break_utf8
.Nd determine byte-offset to next grapheme cluster break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_character_break_utf8 "const char *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_character_break_utf8
function computes the offset (in bytes) to the next grapheme cluster
break (see
.Xr libgrapheme 7 )
in the UTF-8-encoded string
.Va str
of length
.Va len .
If a grapheme cluster begins at
.Va str
this offset is equal to the length of said grapheme cluster.
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a NUL-byte is encountered.
.Pp
For non-UTF-8 input
data
.Xr grapheme_is_character_break 3 and
.Xr grapheme_next_character_break 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_character_break_utf8
function returns the offset (in bytes) to the next grapheme cluster
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stdint.h>
#include <stdio.h>
int
main(void)
{
/* UTF-8 encoded input */
char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
"\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
"\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
"\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
size_t ret, len, off;
printf("Input: \\"%s\\"\\n", s);
/* print each grapheme cluster with byte-length */
printf("grapheme clusters in NUL-delimited input:\\n");
for (off = 0; s[off] != '\\0'; off += ret) {
ret = grapheme_next_character_break_utf8(s + off, SIZE_MAX);
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
}
printf("\\n");
/* do the same, but this time string is length-delimited */
len = 17;
printf("grapheme clusters in input delimited to %zu bytes:\\n", len);
for (off = 0; off < len; off += ret) {
ret = grapheme_next_character_break_utf8(s + off, len - off);
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
}
return 0;
}
.Ed
.Sh SEE ALSO
.Xr grapheme_next_character_break 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_character_break_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,4 @@
ENCODING="utf8" \
TYPE="character" \
REALTYPE="grapheme cluster" \
$SH man/template/next_break.sh

View File

@@ -0,0 +1,52 @@
.Dd 2022-10-06
.Dt GRAPHEME_NEXT_LINE_BREAK 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_line_break
.Nd determine codepoint-offset to next possible line break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_line_break "const uint_least32_t *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_line_break
function computes the offset (in codepoints) to the next possible line
break (see
.Xr libgrapheme 7 )
in the codepoint array
.Va str
of length
.Va len .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input
data
.Xr grapheme_next_line_break_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_line_break
function returns the offset (in codepoints) to the next possible line
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_next_line_break_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_line_break
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,4 @@
ENCODING="codepoint" \
TYPE="line" \
REALTYPE="possible line" \
$SH man/template/next_break.sh

View File

@@ -0,0 +1,90 @@
.Dd 2022-10-06
.Dt GRAPHEME_NEXT_LINE_BREAK_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_line_break_utf8
.Nd determine byte-offset to next possible line break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_line_break_utf8 "const char *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_line_break_utf8
function computes the offset (in bytes) to the next possible line
break (see
.Xr libgrapheme 7 )
in the UTF-8-encoded string
.Va str
of length
.Va len .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a NUL-byte is encountered.
.Pp
For non-UTF-8 input
data
.Xr grapheme_next_line_break 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_line_break_utf8
function returns the offset (in bytes) to the next possible line
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stdint.h>
#include <stdio.h>
int
main(void)
{
/* UTF-8 encoded input */
char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
"\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
"\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
"\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
size_t ret, len, off;
printf("Input: \\"%s\\"\\n", s);
/* print each possible line with byte-length */
printf("possible lines in NUL-delimited input:\\n");
for (off = 0; s[off] != '\\0'; off += ret) {
ret = grapheme_next_line_break_utf8(s + off, SIZE_MAX);
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
}
printf("\\n");
/* do the same, but this time string is length-delimited */
len = 17;
printf("possible lines in input delimited to %zu bytes:\\n", len);
for (off = 0; off < len; off += ret) {
ret = grapheme_next_line_break_utf8(s + off, len - off);
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
}
return 0;
}
.Ed
.Sh SEE ALSO
.Xr grapheme_next_line_break 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_line_break_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,4 @@
ENCODING="utf8" \
TYPE="line" \
REALTYPE="possible line" \
$SH man/template/next_break.sh

View File

@@ -0,0 +1,55 @@
.Dd 2022-10-06
.Dt GRAPHEME_NEXT_SENTENCE_BREAK 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_sentence_break
.Nd determine codepoint-offset to next sentence break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_sentence_break "const uint_least32_t *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_sentence_break
function computes the offset (in codepoints) to the next sentence
break (see
.Xr libgrapheme 7 )
in the codepoint array
.Va str
of length
.Va len .
If a sentence begins at
.Va str
this offset is equal to the length of said sentence.
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input
data
.Xr grapheme_next_sentence_break_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_sentence_break
function returns the offset (in codepoints) to the next sentence
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_next_sentence_break_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_sentence_break
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,4 @@
ENCODING="codepoint" \
TYPE="sentence" \
REALTYPE="sentence" \
$SH man/template/next_break.sh

View File

@@ -0,0 +1,93 @@
.Dd 2022-10-06
.Dt GRAPHEME_NEXT_SENTENCE_BREAK_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_sentence_break_utf8
.Nd determine byte-offset to next sentence break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_sentence_break_utf8 "const char *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_sentence_break_utf8
function computes the offset (in bytes) to the next sentence
break (see
.Xr libgrapheme 7 )
in the UTF-8-encoded string
.Va str
of length
.Va len .
If a sentence begins at
.Va str
this offset is equal to the length of said sentence.
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a NUL-byte is encountered.
.Pp
For non-UTF-8 input
data
.Xr grapheme_next_sentence_break 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_sentence_break_utf8
function returns the offset (in bytes) to the next sentence
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stdint.h>
#include <stdio.h>
int
main(void)
{
/* UTF-8 encoded input */
char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
"\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
"\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
"\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
size_t ret, len, off;
printf("Input: \\"%s\\"\\n", s);
/* print each sentence with byte-length */
printf("sentences in NUL-delimited input:\\n");
for (off = 0; s[off] != '\\0'; off += ret) {
ret = grapheme_next_sentence_break_utf8(s + off, SIZE_MAX);
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
}
printf("\\n");
/* do the same, but this time string is length-delimited */
len = 17;
printf("sentences in input delimited to %zu bytes:\\n", len);
for (off = 0; off < len; off += ret) {
ret = grapheme_next_sentence_break_utf8(s + off, len - off);
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
}
return 0;
}
.Ed
.Sh SEE ALSO
.Xr grapheme_next_sentence_break 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_sentence_break_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,4 @@
ENCODING="utf8" \
TYPE="sentence" \
REALTYPE="sentence" \
$SH man/template/next_break.sh

View File

@@ -0,0 +1,55 @@
.Dd 2022-10-06
.Dt GRAPHEME_NEXT_WORD_BREAK 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_word_break
.Nd determine codepoint-offset to next word break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_word_break "const uint_least32_t *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_word_break
function computes the offset (in codepoints) to the next word
break (see
.Xr libgrapheme 7 )
in the codepoint array
.Va str
of length
.Va len .
If a word begins at
.Va str
this offset is equal to the length of said word.
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input
data
.Xr grapheme_next_word_break_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_word_break
function returns the offset (in codepoints) to the next word
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_next_word_break_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_word_break
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,4 @@
# Render the grapheme_next_word_break page from the shared next_break
# template (codepoint-array variant).
ENCODING="codepoint" TYPE="word" REALTYPE="word" \
	$SH man/template/next_break.sh

View File

@@ -0,0 +1,93 @@
.Dd 2022-10-06
.Dt GRAPHEME_NEXT_WORD_BREAK_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_word_break_utf8
.Nd determine byte-offset to next word break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_word_break_utf8 "const char *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_word_break_utf8
function computes the offset (in bytes) to the next word
break (see
.Xr libgrapheme 7 )
in the UTF-8-encoded string
.Va str
of length
.Va len .
If a word begins at
.Va str
this offset is equal to the length of said word.
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a NUL-byte is encountered.
.Pp
For non-UTF-8 input
data
.Xr grapheme_next_word_break 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_word_break_utf8
function returns the offset (in bytes) to the next word
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stdint.h>
#include <stdio.h>
int
main(void)
{
/* UTF-8 encoded input */
char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
"\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
"\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
"\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
size_t ret, len, off;
printf("Input: \\"%s\\"\\n", s);
/* print each word with byte-length */
printf("words in NUL-delimited input:\\n");
for (off = 0; s[off] != '\\0'; off += ret) {
ret = grapheme_next_word_break_utf8(s + off, SIZE_MAX);
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
}
printf("\\n");
/* do the same, but this time string is length-delimited */
len = 17;
printf("words in input delimited to %zu bytes:\\n", len);
for (off = 0; off < len; off += ret) {
ret = grapheme_next_word_break_utf8(s + off, len - off);
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
}
return 0;
}
.Ed
.Sh SEE ALSO
.Xr grapheme_next_word_break 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_word_break_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,4 @@
# Render the grapheme_next_word_break_utf8 page from the shared
# next_break template (byte-oriented variant).
ENCODING="utf8" TYPE="word" REALTYPE="word" \
	$SH man/template/next_break.sh

View File

@@ -0,0 +1,56 @@
.Dd 2022-10-06
.Dt GRAPHEME_TO_LOWERCASE 3
.Os suckless.org
.Sh NAME
.Nm grapheme_to_lowercase
.Nd convert codepoint array to lowercase
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_to_lowercase "const uint_least32_t *src" "size_t srclen" "uint_least32_t *dest" "size_t destlen"
.Sh DESCRIPTION
The
.Fn grapheme_to_lowercase
function converts the codepoint array
.Va src
to lowercase and writes the result to
.Va dest
up to
.Va destlen ,
unless
.Va dest
is set to
.Dv NULL .
.Pp
If
.Va srclen
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the codepoint array
.Va src
is interpreted to be NUL-terminated and processing stops when a
codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input data
.Xr grapheme_to_lowercase_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_to_lowercase
function returns the number of codepoints in the array resulting
from converting
.Va src
to lowercase, even if
.Va destlen
is not large enough or
.Va dest
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_to_lowercase_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_to_lowercase
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
# Render the grapheme_to_lowercase page from the shared to_case template
# (codepoint-array variant).
ENCODING="codepoint" CASE="lowercase" $SH man/template/to_case.sh

View File

@@ -0,0 +1,56 @@
.Dd 2022-10-06
.Dt GRAPHEME_TO_LOWERCASE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_to_lowercase_utf8
.Nd convert UTF-8-encoded string to lowercase
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_to_lowercase_utf8 "const char *src" "size_t srclen" "char *dest" "size_t destlen"
.Sh DESCRIPTION
The
.Fn grapheme_to_lowercase_utf8
function converts the UTF-8-encoded string
.Va src
to lowercase and writes the result to
.Va dest
up to
.Va destlen ,
unless
.Va dest
is set to
.Dv NULL .
.Pp
If
.Va srclen
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
.Va src
is interpreted to be NUL-terminated and processing stops when a
NUL-byte is encountered.
.Pp
For non-UTF-8 input data
.Xr grapheme_to_lowercase 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_to_lowercase_utf8
function returns the number of bytes in the array resulting
from converting
.Va src
to lowercase, even if
.Va destlen
is not large enough or
.Va dest
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_to_lowercase 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_to_lowercase_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
# Render the grapheme_to_lowercase_utf8 page from the shared to_case
# template (byte-oriented variant).
ENCODING="utf8" CASE="lowercase" $SH man/template/to_case.sh

View File

@@ -0,0 +1,56 @@
.Dd 2022-10-06
.Dt GRAPHEME_TO_TITLECASE 3
.Os suckless.org
.Sh NAME
.Nm grapheme_to_titlecase
.Nd convert codepoint array to titlecase
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_to_titlecase "const uint_least32_t *src" "size_t srclen" "uint_least32_t *dest" "size_t destlen"
.Sh DESCRIPTION
The
.Fn grapheme_to_titlecase
function converts the codepoint array
.Va src
to titlecase and writes the result to
.Va dest
up to
.Va destlen ,
unless
.Va dest
is set to
.Dv NULL .
.Pp
If
.Va srclen
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the codepoint array
.Va src
is interpreted to be NUL-terminated and processing stops when a
codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input data
.Xr grapheme_to_titlecase_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_to_titlecase
function returns the number of codepoints in the array resulting
from converting
.Va src
to titlecase, even if
.Va destlen
is not large enough or
.Va dest
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_to_titlecase_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_to_titlecase
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
# Render the grapheme_to_titlecase page from the shared to_case template
# (codepoint-array variant).
ENCODING="codepoint" CASE="titlecase" $SH man/template/to_case.sh

View File

@@ -0,0 +1,56 @@
.Dd 2022-10-06
.Dt GRAPHEME_TO_TITLECASE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_to_titlecase_utf8
.Nd convert UTF-8-encoded string to titlecase
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_to_titlecase_utf8 "const char *src" "size_t srclen" "char *dest" "size_t destlen"
.Sh DESCRIPTION
The
.Fn grapheme_to_titlecase_utf8
function converts the UTF-8-encoded string
.Va src
to titlecase and writes the result to
.Va dest
up to
.Va destlen ,
unless
.Va dest
is set to
.Dv NULL .
.Pp
If
.Va srclen
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
.Va src
is interpreted to be NUL-terminated and processing stops when a
NUL-byte is encountered.
.Pp
For non-UTF-8 input data
.Xr grapheme_to_titlecase 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_to_titlecase_utf8
function returns the number of bytes in the array resulting
from converting
.Va src
to titlecase, even if
.Va destlen
is not large enough or
.Va dest
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_to_titlecase 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_to_titlecase_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
# Render the grapheme_to_titlecase_utf8 page from the shared to_case
# template (byte-oriented variant).
ENCODING="utf8" CASE="titlecase" $SH man/template/to_case.sh

View File

@@ -0,0 +1,56 @@
.Dd 2022-10-06
.Dt GRAPHEME_TO_UPPERCASE 3
.Os suckless.org
.Sh NAME
.Nm grapheme_to_uppercase
.Nd convert codepoint array to uppercase
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_to_uppercase "const uint_least32_t *src" "size_t srclen" "uint_least32_t *dest" "size_t destlen"
.Sh DESCRIPTION
The
.Fn grapheme_to_uppercase
function converts the codepoint array
.Va src
to uppercase and writes the result to
.Va dest
up to
.Va destlen ,
unless
.Va dest
is set to
.Dv NULL .
.Pp
If
.Va srclen
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the codepoint array
.Va src
is interpreted to be NUL-terminated and processing stops when a
codepoint with the value 0 is encountered.
.Pp
For UTF-8-encoded input data
.Xr grapheme_to_uppercase_utf8 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_to_uppercase
function returns the number of codepoints in the array resulting
from converting
.Va src
to uppercase, even if
.Va destlen
is not large enough or
.Va dest
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_to_uppercase_utf8 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_to_uppercase
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
# Render the grapheme_to_uppercase page from the shared to_case template
# (codepoint-array variant).
ENCODING="codepoint" CASE="uppercase" $SH man/template/to_case.sh

View File

@@ -0,0 +1,56 @@
.Dd 2022-10-06
.Dt GRAPHEME_TO_UPPERCASE_UTF8 3
.Os suckless.org
.Sh NAME
.Nm grapheme_to_uppercase_utf8
.Nd convert UTF-8-encoded string to uppercase
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_to_uppercase_utf8 "const char *src" "size_t srclen" "char *dest" "size_t destlen"
.Sh DESCRIPTION
The
.Fn grapheme_to_uppercase_utf8
function converts the UTF-8-encoded string
.Va src
to uppercase and writes the result to
.Va dest
up to
.Va destlen ,
unless
.Va dest
is set to
.Dv NULL .
.Pp
If
.Va srclen
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
.Va src
is interpreted to be NUL-terminated and processing stops when a
NUL-byte is encountered.
.Pp
For non-UTF-8 input data
.Xr grapheme_to_uppercase 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_to_uppercase_utf8
function returns the number of bytes in the array resulting
from converting
.Va src
to uppercase, even if
.Va destlen
is not large enough or
.Va dest
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_to_uppercase 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_to_uppercase_utf8
is compliant with the Unicode 15.0.0 specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,3 @@
# Render the grapheme_to_uppercase_utf8 page from the shared to_case
# template (byte-oriented variant).
# CASE was "lowercase" here, which made this wrapper emit a second copy of
# the lowercase UTF-8 page instead of the uppercase one.
ENCODING="utf8" \
	CASE="uppercase" \
	$SH man/template/to_case.sh

View File

@@ -0,0 +1,165 @@
.Dd 2022-10-06
.Dt LIBGRAPHEME 7
.Os suckless.org
.Sh NAME
.Nm libgrapheme
.Nd unicode string library
.Sh SYNOPSIS
.In grapheme.h
.Sh DESCRIPTION
The
.Nm
library provides functions to properly handle Unicode strings according
to the Unicode specification in regard to character, word, sentence and
line segmentation and case detection and conversion.
.Pp
Unicode strings are made up of user-perceived characters (so-called
.Dq grapheme clusters ,
see
.Sx MOTIVATION )
that are composed of one or more Unicode codepoints, which in turn
are encoded in one or more bytes in an encoding like UTF-8.
.Pp
There is a widespread misconception that it was enough to simply
determine codepoints in a string and treat them as user-perceived
characters to be Unicode compliant.
While this may work in some cases, this assumption quickly breaks,
especially for non-Western languages and decomposed Unicode strings
where user-perceived characters are usually represented using multiple
codepoints.
.Pp
Despite this complicated multilevel structure of Unicode strings,
.Nm
provides methods to work with them at the byte-level (i.e. UTF-8
.Sq char
arrays) while also offering codepoint-level methods.
Additionally, it is a
.Dq freestanding
library (see ISO/IEC 9899:1999 section 4.6) and thus does not depend on
a standard library. This makes it easy to use in bare metal environments.
.Pp
Every documented function's manual page provides a self-contained
example illustrating the possible usage.
.Sh SEE ALSO
.Xr grapheme_decode_utf8 3 ,
.Xr grapheme_encode_utf8 3 ,
.Xr grapheme_is_character_break 3 ,
.Xr grapheme_is_lowercase 3 ,
.Xr grapheme_is_lowercase_utf8 3 ,
.Xr grapheme_is_titlecase 3 ,
.Xr grapheme_is_titlecase_utf8 3 ,
.Xr grapheme_is_uppercase 3 ,
.Xr grapheme_is_uppercase_utf8 3 ,
.Xr grapheme_next_character_break 3 ,
.Xr grapheme_next_character_break_utf8 3 ,
.Xr grapheme_next_line_break 3 ,
.Xr grapheme_next_line_break_utf8 3 ,
.Xr grapheme_next_sentence_break 3 ,
.Xr grapheme_next_sentence_break_utf8 3 ,
.Xr grapheme_next_word_break 3 ,
.Xr grapheme_next_word_break_utf8 3 ,
.Xr grapheme_to_lowercase 3 ,
.Xr grapheme_to_lowercase_utf8 3 ,
.Xr grapheme_to_titlecase 3 ,
.Xr grapheme_to_titlecase_utf8 3 ,
.Xr grapheme_to_uppercase 3 ,
.Xr grapheme_to_uppercase_utf8 3
.Sh STANDARDS
.Nm
is compliant with the Unicode 15.0.0 specification.
.Sh MOTIVATION
The idea behind every character encoding scheme like ASCII or Unicode
is to express abstract characters (which can be thought of as shapes
making up a written language). ASCII for instance, which comprises the
range 0 to 127, assigns the number 65 (0x41) to the abstract character
.Sq A .
This number is called a
.Dq codepoint ,
and all codepoints of an encoding make up its so-called
.Dq code space .
.Pp
Unicode's code space is much larger, ranging from 0 to 0x10FFFF, but its
first 128 codepoints are identical to ASCII's. The additional code
points are needed as Unicode's goal is to express all writing systems
of the world.
To give an example, the abstract character
.Sq \[u00C4]
is not expressible in ASCII, given no ASCII codepoint has been assigned
to it.
It can be expressed in Unicode, though, with the codepoint 196 (0xC4).
.Pp
One may assume that this process is straightforward, but as more and
more codepoints were assigned to abstract characters, the Unicode
Consortium (that defines the Unicode standard) was facing a problem:
Many (mostly non-European) languages have such a large amount of
abstract characters that it would exhaust the available Unicode code
space if one tried to assign a codepoint to each abstract character.
The solution to that problem is best introduced with an example: Consider
the abstract character
.Sq \[u01DE] ,
which is
.Sq A
with an umlaut and a macron added to it.
In this sense, one can consider
.Sq \[u01DE]
as a two-fold modification (namely
.Dq add umlaut
and
.Dq add macron )
of the
.Dq base character
.Sq A .
.Pp
The Unicode Consortium adapted this idea by assigning codepoints to
modifications.
For example, the codepoint 0x308 represents adding an umlaut and 0x304
represents adding a macron, and thus, the codepoint sequence
.Dq 0x41 0x308 0x304 ,
namely the base character
.Sq A
followed by the umlaut and macron modifiers, represents the abstract
character
.Sq \[u01DE] .
As a side-note, the single codepoint 0x1DE was also assigned to
.Sq \[u01DE] ,
which is a good example for the fact that there can be multiple
representations of a single abstract character in Unicode.
.Pp
Expressing a single abstract character with multiple codepoints solved
the code space exhaustion-problem, and the concept has been greatly
expanded since its first introduction (emojis, joiners, etc.). A sequence
(which can also have the length 1) of codepoints that belong together
this way and represents an abstract character is called a
.Dq grapheme cluster .
.Pp
In many applications it is necessary to count the number of
user-perceived characters, i.e. grapheme clusters, in a string.
A good example for this is a terminal text editor, which needs to
properly align characters on a grid.
This is pretty simple with ASCII-strings, where you just count the number
of bytes (as each byte is a codepoint and each codepoint is a grapheme
cluster).
With Unicode-strings, it is a common mistake to simply adapt the
ASCII-approach and count the number of code points.
This is wrong, as, for example, the sequence
.Dq 0x41 0x308 0x304 ,
while made up of 3 codepoints, is a single grapheme cluster and
represents the user-perceived character
.Sq \[u01DE] .
.Pp
The proper way to segment a string into user-perceived characters
is to segment it into its grapheme clusters by applying the Unicode
grapheme cluster breaking algorithm (UAX #29).
It is based on a complex ruleset and lookup-tables and determines if a
grapheme cluster ends or is continued between two codepoints.
Libraries like ICU and libunistring, which also offer this functionality,
are often bloated, not correct, difficult to use or not reasonably
statically linkable.
.Pp
Analogously, the standard provides algorithms to separate strings by
words, sentences and lines, convert cases and compare strings.
The motivation behind
.Nm
is to make unicode handling suck less and abide by the UNIX philosophy.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de

View File

@@ -0,0 +1,167 @@
cat << EOF
.Dd ${MAN_DATE}
.Dt LIBGRAPHEME 7
.Os suckless.org
.Sh NAME
.Nm libgrapheme
.Nd unicode string library
.Sh SYNOPSIS
.In grapheme.h
.Sh DESCRIPTION
The
.Nm
library provides functions to properly handle Unicode strings according
to the Unicode specification in regard to character, word, sentence and
line segmentation and case detection and conversion.
.Pp
Unicode strings are made up of user-perceived characters (so-called
.Dq grapheme clusters ,
see
.Sx MOTIVATION )
that are composed of one or more Unicode codepoints, which in turn
are encoded in one or more bytes in an encoding like UTF-8.
.Pp
There is a widespread misconception that it was enough to simply
determine codepoints in a string and treat them as user-perceived
characters to be Unicode compliant.
While this may work in some cases, this assumption quickly breaks,
especially for non-Western languages and decomposed Unicode strings
where user-perceived characters are usually represented using multiple
codepoints.
.Pp
Despite this complicated multilevel structure of Unicode strings,
.Nm
provides methods to work with them at the byte-level (i.e. UTF-8
.Sq char
arrays) while also offering codepoint-level methods.
Additionally, it is a
.Dq freestanding
library (see ISO/IEC 9899:1999 section 4.6) and thus does not depend on
a standard library. This makes it easy to use in bare metal environments.
.Pp
Every documented function's manual page provides a self-contained
example illustrating the possible usage.
.Sh SEE ALSO
.Xr grapheme_decode_utf8 3 ,
.Xr grapheme_encode_utf8 3 ,
.Xr grapheme_is_character_break 3 ,
.Xr grapheme_is_lowercase 3 ,
.Xr grapheme_is_lowercase_utf8 3 ,
.Xr grapheme_is_titlecase 3 ,
.Xr grapheme_is_titlecase_utf8 3 ,
.Xr grapheme_is_uppercase 3 ,
.Xr grapheme_is_uppercase_utf8 3 ,
.Xr grapheme_next_character_break 3 ,
.Xr grapheme_next_character_break_utf8 3 ,
.Xr grapheme_next_line_break 3 ,
.Xr grapheme_next_line_break_utf8 3 ,
.Xr grapheme_next_sentence_break 3 ,
.Xr grapheme_next_sentence_break_utf8 3 ,
.Xr grapheme_next_word_break 3 ,
.Xr grapheme_next_word_break_utf8 3 ,
.Xr grapheme_to_lowercase 3 ,
.Xr grapheme_to_lowercase_utf8 3 ,
.Xr grapheme_to_titlecase 3 ,
.Xr grapheme_to_titlecase_utf8 3 ,
.Xr grapheme_to_uppercase 3 ,
.Xr grapheme_to_uppercase_utf8 3
.Sh STANDARDS
.Nm
is compliant with the Unicode ${UNICODE_VERSION} specification.
.Sh MOTIVATION
The idea behind every character encoding scheme like ASCII or Unicode
is to express abstract characters (which can be thought of as shapes
making up a written language). ASCII for instance, which comprises the
range 0 to 127, assigns the number 65 (0x41) to the abstract character
.Sq A .
This number is called a
.Dq codepoint ,
and all codepoints of an encoding make up its so-called
.Dq code space .
.Pp
Unicode's code space is much larger, ranging from 0 to 0x10FFFF, but its
first 128 codepoints are identical to ASCII's. The additional code
points are needed as Unicode's goal is to express all writing systems
of the world.
To give an example, the abstract character
.Sq \[u00C4]
is not expressible in ASCII, given no ASCII codepoint has been assigned
to it.
It can be expressed in Unicode, though, with the codepoint 196 (0xC4).
.Pp
One may assume that this process is straightforward, but as more and
more codepoints were assigned to abstract characters, the Unicode
Consortium (that defines the Unicode standard) was facing a problem:
Many (mostly non-European) languages have such a large amount of
abstract characters that it would exhaust the available Unicode code
space if one tried to assign a codepoint to each abstract character.
The solution to that problem is best introduced with an example: Consider
the abstract character
.Sq \[u01DE] ,
which is
.Sq A
with an umlaut and a macron added to it.
In this sense, one can consider
.Sq \[u01DE]
as a two-fold modification (namely
.Dq add umlaut
and
.Dq add macron )
of the
.Dq base character
.Sq A .
.Pp
The Unicode Consortium adapted this idea by assigning codepoints to
modifications.
For example, the codepoint 0x308 represents adding an umlaut and 0x304
represents adding a macron, and thus, the codepoint sequence
.Dq 0x41 0x308 0x304 ,
namely the base character
.Sq A
followed by the umlaut and macron modifiers, represents the abstract
character
.Sq \[u01DE] .
As a side-note, the single codepoint 0x1DE was also assigned to
.Sq \[u01DE] ,
which is a good example for the fact that there can be multiple
representations of a single abstract character in Unicode.
.Pp
Expressing a single abstract character with multiple codepoints solved
the code space exhaustion-problem, and the concept has been greatly
expanded since its first introduction (emojis, joiners, etc.). A sequence
(which can also have the length 1) of codepoints that belong together
this way and represents an abstract character is called a
.Dq grapheme cluster .
.Pp
In many applications it is necessary to count the number of
user-perceived characters, i.e. grapheme clusters, in a string.
A good example for this is a terminal text editor, which needs to
properly align characters on a grid.
This is pretty simple with ASCII-strings, where you just count the number
of bytes (as each byte is a codepoint and each codepoint is a grapheme
cluster).
With Unicode-strings, it is a common mistake to simply adapt the
ASCII-approach and count the number of code points.
This is wrong, as, for example, the sequence
.Dq 0x41 0x308 0x304 ,
while made up of 3 codepoints, is a single grapheme cluster and
represents the user-perceived character
.Sq \[u01DE] .
.Pp
The proper way to segment a string into user-perceived characters
is to segment it into its grapheme clusters by applying the Unicode
grapheme cluster breaking algorithm (UAX #29).
It is based on a complex ruleset and lookup-tables and determines if a
grapheme cluster ends or is continued between two codepoints.
Libraries like ICU and libunistring, which also offer this functionality,
are often bloated, not correct, difficult to use or not reasonably
statically linkable.
.Pp
Analogously, the standard provides algorithms to separate strings by
words, sentences and lines, convert cases and compare strings.
The motivation behind
.Nm
is to make unicode handling suck less and abide by the UNIX philosophy.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF

View File

@@ -0,0 +1,67 @@
# Template for the grapheme_is_<case>[_utf8](3) manual pages.
#
# Reads from the environment:
#   ENCODING         "utf8" selects the byte-oriented variant; any other
#                    value selects the codepoint-array variant
#   CASE             case name inserted into the function names and text
#   MAN_DATE         date placed in the .Dd macro
#   UNICODE_VERSION  version named in the STANDARDS section
# Writes the finished mdoc source to stdout.
if [ "$ENCODING" = "utf8" ]; then
	UNIT="byte"
	ARRAYTYPE="UTF-8-encoded string"
	SUFFIX="_utf8"
	ANTISUFFIX=""
	DATATYPE="char"
	TERMINATOR="NUL-byte"
else
	UNIT="codepoint"
	ARRAYTYPE="codepoint array"
	SUFFIX=""
	ANTISUFFIX="_utf8"
	DATATYPE="uint_least32_t"
	# a codepoint array ends at a zero codepoint, not at a single byte
	TERMINATOR="codepoint with the value 0"
fi

# The character classes passed to tr are quoted so the shell cannot
# expand them as glob patterns against files in the working directory.
cat << EOF
.Dd ${MAN_DATE}
.Dt GRAPHEME_IS_$(printf "%s%s" "$CASE" "$SUFFIX" | tr '[:lower:]' '[:upper:]') 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_${CASE}${SUFFIX}
.Nd check if ${ARRAYTYPE} is ${CASE}
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_${CASE}${SUFFIX} "const ${DATATYPE} *str" "size_t len" "size_t *caselen"
.Sh DESCRIPTION
The
.Fn grapheme_is_${CASE}${SUFFIX}
function checks if the ${ARRAYTYPE}
.Va str
is ${CASE} and writes the length of the matching ${CASE}-sequence to the integer pointed to by
.Va caselen ,
unless
.Va caselen
is set to
.Dv NULL .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the ${ARRAYTYPE}
.Va str
is interpreted to be NUL-terminated and processing stops when a
${TERMINATOR} is encountered.
.Pp
For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input data
.Xr grapheme_is_${CASE}${ANTISUFFIX} 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_is_${CASE}${SUFFIX}
function returns
.Dv true
if the ${ARRAYTYPE}
.Va str
is ${CASE}, otherwise
.Dv false .
.Sh SEE ALSO
.Xr grapheme_is_${CASE}${ANTISUFFIX} 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_${CASE}${SUFFIX}
is compliant with the Unicode ${UNICODE_VERSION} specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF

View File

@@ -0,0 +1,112 @@
# Template for the grapheme_next_<type>_break[_utf8](3) manual pages.
#
# Reads from the environment:
#   ENCODING         "utf8" selects the byte-oriented page, which also gets
#                    an EXAMPLES section; any other value selects the
#                    codepoint-array page
#   TYPE             break type used in the function names
#   REALTYPE         break type as spelled in the running text
#   MAN_DATE         date placed in the .Dd macro
#   UNICODE_VERSION  version named in the STANDARDS section
# Writes the finished mdoc source to stdout.
if [ "$ENCODING" = "utf8" ]; then
	UNIT="byte"
	SUFFIX="_utf8"
	ANTISUFFIX=""
else
	UNIT="codepoint"
	SUFFIX=""
	ANTISUFFIX="_utf8"
fi

# Page header through RETURN VALUES.
# The character classes passed to tr are quoted so the shell cannot
# expand them as glob patterns against files in the working directory.
cat << EOF
.Dd ${MAN_DATE}
.Dt GRAPHEME_NEXT_$(printf "%s_break%s" "$TYPE" "$SUFFIX" | tr '[:lower:]' '[:upper:]') 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_${TYPE}_break${SUFFIX}
.Nd determine ${UNIT}-offset to next ${REALTYPE} break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_${TYPE}_break${SUFFIX} "const $(if [ "$ENCODING" = "utf8" ]; then printf "char"; else printf "uint_least32_t"; fi) *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_${TYPE}_break${SUFFIX}
function computes the offset (in ${UNIT}s) to the next ${REALTYPE}
break (see
.Xr libgrapheme 7 )
in the $(if [ "$ENCODING" = "utf8" ]; then printf "UTF-8-encoded string"; else printf "codepoint array"; fi)
.Va str
of length
.Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a ${REALTYPE} begins at
.Va str
this offset is equal to the length of said ${REALTYPE}."; fi)
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a $(if [ "$ENCODING" = "utf8" ]; then printf "NUL-byte"; else printf "codepoint with the value 0"; fi) is encountered.
.Pp
For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input
data$(if [ "$TYPE" = "character" ] && [ "$ENCODING" = "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 and"; fi)
.Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_${TYPE}_break${SUFFIX}
function returns the offset (in ${UNIT}s) to the next ${REALTYPE}
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
EOF

# Only the UTF-8 pages carry an example program. Its body is indented so
# that the rendered literal display reads like ordinary C source.
if [ "$ENCODING" = "utf8" ]; then
cat << EOF
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stdint.h>
#include <stdio.h>
int
main(void)
{
	/* UTF-8 encoded input */
	char *s = "T\\\\xC3\\\\xABst \\\\xF0\\\\x9F\\\\x91\\\\xA8\\\\xE2\\\\x80\\\\x8D\\\\xF0"
	          "\\\\x9F\\\\x91\\\\xA9\\\\xE2\\\\x80\\\\x8D\\\\xF0\\\\x9F\\\\x91\\\\xA6 \\\\xF0"
	          "\\\\x9F\\\\x87\\\\xBA\\\\xF0\\\\x9F\\\\x87\\\\xB8 \\\\xE0\\\\xA4\\\\xA8\\\\xE0"
	          "\\\\xA5\\\\x80 \\\\xE0\\\\xAE\\\\xA8\\\\xE0\\\\xAE\\\\xBF!";
	size_t ret, len, off;
	printf("Input: \\\\"%s\\\\"\\\\n", s);
	/* print each ${REALTYPE} with byte-length */
	printf("${REALTYPE}s in NUL-delimited input:\\\\n");
	for (off = 0; s[off] != '\\\\0'; off += ret) {
		ret = grapheme_next_${TYPE}_break_utf8(s + off, SIZE_MAX);
		printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
	}
	printf("\\\\n");
	/* do the same, but this time string is length-delimited */
	len = 17;
	printf("${REALTYPE}s in input delimited to %zu bytes:\\\\n", len);
	for (off = 0; off < len; off += ret) {
		ret = grapheme_next_${TYPE}_break_utf8(s + off, len - off);
		printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
	}
	return 0;
}
.Ed
EOF
fi

# Trailing sections shared by both variants.
cat << EOF
.Sh SEE ALSO$(if [ "$TYPE" = "character" ] && [ "$ENCODING" != "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 ,"; fi)
.Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_${TYPE}_break${SUFFIX}
is compliant with the Unicode ${UNICODE_VERSION} specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF

View File

@@ -0,0 +1,72 @@
# Template for the grapheme_to_<case>[_utf8](3) manual pages.
#
# Reads from the environment:
#   ENCODING         "utf8" selects the byte-oriented variant; any other
#                    value selects the codepoint-array variant
#   CASE             target case inserted into the function names and text
#   MAN_DATE         date placed in the .Dd macro
#   UNICODE_VERSION  version named in the STANDARDS section
# Writes the finished mdoc source to stdout.
if [ "$ENCODING" = "utf8" ]; then
	UNIT="byte"
	ARRAYTYPE="UTF-8-encoded string"
	SUFFIX="_utf8"
	ANTISUFFIX=""
	DATATYPE="char"
	TERMINATOR="NUL-byte"
else
	UNIT="codepoint"
	ARRAYTYPE="codepoint array"
	SUFFIX=""
	ANTISUFFIX="_utf8"
	DATATYPE="uint_least32_t"
	# a codepoint array ends at a zero codepoint, not at a single byte
	TERMINATOR="codepoint with the value 0"
fi

# The character classes passed to tr are quoted so the shell cannot
# expand them as glob patterns against files in the working directory.
cat << EOF
.Dd ${MAN_DATE}
.Dt GRAPHEME_TO_$(printf "%s%s" "$CASE" "$SUFFIX" | tr '[:lower:]' '[:upper:]') 3
.Os suckless.org
.Sh NAME
.Nm grapheme_to_${CASE}${SUFFIX}
.Nd convert ${ARRAYTYPE} to ${CASE}
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_to_${CASE}${SUFFIX} "const ${DATATYPE} *src" "size_t srclen" "${DATATYPE} *dest" "size_t destlen"
.Sh DESCRIPTION
The
.Fn grapheme_to_${CASE}${SUFFIX}
function converts the ${ARRAYTYPE}
.Va src
to ${CASE} and writes the result to
.Va dest
up to
.Va destlen ,
unless
.Va dest
is set to
.Dv NULL .
.Pp
If
.Va srclen
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the ${ARRAYTYPE}
.Va src
is interpreted to be NUL-terminated and processing stops when a
${TERMINATOR} is encountered.
.Pp
For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input data
.Xr grapheme_to_${CASE}${ANTISUFFIX} 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_to_${CASE}${SUFFIX}
function returns the number of ${UNIT}s in the array resulting
from converting
.Va src
to ${CASE}, even if
.Va destlen
is not large enough or
.Va dest
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_to_${CASE}${ANTISUFFIX} 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_to_${CASE}${SUFFIX}
is compliant with the Unicode ${UNICODE_VERSION} specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF