Initial Commit
This commit is contained in:
100
libs/libgrapheme-2.0.2/man/grapheme_decode_utf8.3
Normal file
100
libs/libgrapheme-2.0.2/man/grapheme_decode_utf8.3
Normal file
@@ -0,0 +1,100 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_DECODE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_decode_utf8
|
||||
.Nd decode first codepoint in UTF-8-encoded string
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_decode_utf8
|
||||
function decodes the first codepoint in the UTF-8-encoded string
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
|
||||
string ends unexpectedly, empty string, etc.) the decoding is stopped
|
||||
at the last processed byte and the decoded codepoint set to
|
||||
.Dv GRAPHEME_INVALID_CODEPOINT .
|
||||
.Pp
|
||||
If
|
||||
.Va cp
|
||||
is not
|
||||
.Dv NULL
|
||||
the decoded codepoint is stored in the memory pointed to by
|
||||
.Va cp .
|
||||
.Pp
|
||||
Given NUL has a unique 1 byte representation, it is safe to operate on
|
||||
NUL-terminated strings by setting
|
||||
.Va len
|
||||
to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) and terminating when
|
||||
.Va cp
|
||||
is 0 (see
|
||||
.Sx EXAMPLES
|
||||
for an example).
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_decode_utf8
|
||||
function returns the number of processed bytes and 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL
|
||||
or
|
||||
.Va len
|
||||
is 0.
|
||||
If the string ends unexpectedly in a multibyte sequence, the desired
|
||||
length (that is larger than
|
||||
.Va len )
|
||||
is returned.
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
void
|
||||
print_cps(const char *str, size_t len)
|
||||
{
|
||||
size_t ret, off;
|
||||
uint_least32_t cp;
|
||||
|
||||
for (off = 0; off < len; off += ret) {
|
||||
if ((ret = grapheme_decode_utf8(str + off,
|
||||
len - off, &cp)) > (len - off)) {
|
||||
/*
|
||||
* string ended unexpectedly in the middle of a
|
||||
* multibyte sequence and we have the choice
|
||||
* here to possibly expand str by ret - len + off
|
||||
* bytes to get a full sequence, but we just
|
||||
* bail out in this case.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
printf("%"PRIxLEAST32"\\n", cp);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
print_cps_nul_terminated(const char *str)
|
||||
{
|
||||
size_t ret, off;
|
||||
uint_least32_t cp;
|
||||
|
||||
for (off = 0; (ret = grapheme_decode_utf8(str + off,
|
||||
SIZE_MAX, &cp)) > 0 &&
|
||||
cp != 0; off += ret) {
|
||||
printf("%"PRIxLEAST32"\\n", cp);
|
||||
}
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_encode_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
102
libs/libgrapheme-2.0.2/man/grapheme_decode_utf8.sh
Normal file
102
libs/libgrapheme-2.0.2/man/grapheme_decode_utf8.sh
Normal file
@@ -0,0 +1,102 @@
|
||||
cat << EOF
|
||||
.Dd ${MAN_DATE}
|
||||
.Dt GRAPHEME_DECODE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_decode_utf8
|
||||
.Nd decode first codepoint in UTF-8-encoded string
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_decode_utf8
|
||||
function decodes the first codepoint in the UTF-8-encoded string
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
|
||||
string ends unexpectedly, empty string, etc.) the decoding is stopped
|
||||
at the last processed byte and the decoded codepoint set to
|
||||
.Dv GRAPHEME_INVALID_CODEPOINT .
|
||||
.Pp
|
||||
If
|
||||
.Va cp
|
||||
is not
|
||||
.Dv NULL
|
||||
the decoded codepoint is stored in the memory pointed to by
|
||||
.Va cp .
|
||||
.Pp
|
||||
Given NUL has a unique 1 byte representation, it is safe to operate on
|
||||
NUL-terminated strings by setting
|
||||
.Va len
|
||||
to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) and terminating when
|
||||
.Va cp
|
||||
is 0 (see
|
||||
.Sx EXAMPLES
|
||||
for an example).
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_decode_utf8
|
||||
function returns the number of processed bytes and 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL
|
||||
or
|
||||
.Va len
|
||||
is 0.
|
||||
If the string ends unexpectedly in a multibyte sequence, the desired
|
||||
length (that is larger than
|
||||
.Va len )
|
||||
is returned.
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
void
|
||||
print_cps(const char *str, size_t len)
|
||||
{
|
||||
size_t ret, off;
|
||||
uint_least32_t cp;
|
||||
|
||||
for (off = 0; off < len; off += ret) {
|
||||
if ((ret = grapheme_decode_utf8(str + off,
|
||||
len - off, &cp)) > (len - off)) {
|
||||
/*
|
||||
* string ended unexpectedly in the middle of a
|
||||
* multibyte sequence and we have the choice
|
||||
* here to possibly expand str by ret - len + off
|
||||
* bytes to get a full sequence, but we just
|
||||
* bail out in this case.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
printf("%"PRIxLEAST32"\\\\n", cp);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
print_cps_nul_terminated(const char *str)
|
||||
{
|
||||
size_t ret, off;
|
||||
uint_least32_t cp;
|
||||
|
||||
for (off = 0; (ret = grapheme_decode_utf8(str + off,
|
||||
SIZE_MAX, &cp)) > 0 &&
|
||||
cp != 0; off += ret) {
|
||||
printf("%"PRIxLEAST32"\\\\n", cp);
|
||||
}
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_encode_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
||||
EOF
|
101
libs/libgrapheme-2.0.2/man/grapheme_encode_utf8.3
Normal file
101
libs/libgrapheme-2.0.2/man/grapheme_encode_utf8.3
Normal file
@@ -0,0 +1,101 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_ENCODE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_encode_utf8
|
||||
.Nd encode codepoint into UTF-8 string
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_encode_utf8
|
||||
function encodes the codepoint
|
||||
.Va cp
|
||||
into a UTF-8-string.
|
||||
If
|
||||
.Va str
|
||||
is not
|
||||
.Dv NULL
|
||||
and
|
||||
.Va len
|
||||
is large enough it writes the UTF-8-string to the memory pointed to by
|
||||
.Va str .
|
||||
Otherwise no data is written.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_encode_utf8
|
||||
function returns the length (in bytes) of the UTF-8-string resulting
|
||||
from encoding
|
||||
.Va cp ,
|
||||
even if
|
||||
.Va len
|
||||
is not large enough or
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
size_t
|
||||
cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
|
||||
{
|
||||
size_t i, off, ret;
|
||||
|
||||
for (i = 0, off = 0; i < cplen; i++, off += ret) {
|
||||
if ((ret = grapheme_encode_utf8(cp[i], str + off,
|
||||
len - off)) > (len - off)) {
|
||||
/* buffer too small */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return off;
|
||||
}
|
||||
|
||||
size_t
|
||||
cps_bytelen(const uint_least32_t *cp, size_t cplen)
|
||||
{
|
||||
size_t i, len;
|
||||
|
||||
for (i = 0, len = 0; i < cplen; i++) {
|
||||
len += grapheme_encode_utf8(cp[i], NULL, 0);
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
char *
|
||||
cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
|
||||
{
|
||||
char *str;
|
||||
size_t len, i, ret, off;
|
||||
|
||||
len = cps_bytelen(cp, cplen);
|
||||
|
||||
if (!(str = malloc(len + 1))) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for (i = 0, off = 0; i < cplen; i++, off += ret) {
|
||||
if ((ret = grapheme_encode_utf8(cp[i], str + off,
|
||||
len - off)) > (len - off)) {
|
||||
/* buffer too small */
|
||||
break;
|
||||
}
|
||||
}
|
||||
str[off] = '\\0';
|
||||
|
||||
return str;
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_decode_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
103
libs/libgrapheme-2.0.2/man/grapheme_encode_utf8.sh
Normal file
103
libs/libgrapheme-2.0.2/man/grapheme_encode_utf8.sh
Normal file
@@ -0,0 +1,103 @@
|
||||
cat << EOF
|
||||
.Dd ${MAN_DATE}
|
||||
.Dt GRAPHEME_ENCODE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_encode_utf8
|
||||
.Nd encode codepoint into UTF-8 string
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_encode_utf8
|
||||
function encodes the codepoint
|
||||
.Va cp
|
||||
into a UTF-8-string.
|
||||
If
|
||||
.Va str
|
||||
is not
|
||||
.Dv NULL
|
||||
and
|
||||
.Va len
|
||||
is large enough it writes the UTF-8-string to the memory pointed to by
|
||||
.Va str .
|
||||
Otherwise no data is written.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_encode_utf8
|
||||
function returns the length (in bytes) of the UTF-8-string resulting
|
||||
from encoding
|
||||
.Va cp ,
|
||||
even if
|
||||
.Va len
|
||||
is not large enough or
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
size_t
|
||||
cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
|
||||
{
|
||||
size_t i, off, ret;
|
||||
|
||||
for (i = 0, off = 0; i < cplen; i++, off += ret) {
|
||||
if ((ret = grapheme_encode_utf8(cp[i], str + off,
|
||||
len - off)) > (len - off)) {
|
||||
/* buffer too small */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return off;
|
||||
}
|
||||
|
||||
size_t
|
||||
cps_bytelen(const uint_least32_t *cp, size_t cplen)
|
||||
{
|
||||
size_t i, len;
|
||||
|
||||
for (i = 0, len = 0; i < cplen; i++) {
|
||||
len += grapheme_encode_utf8(cp[i], NULL, 0);
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
char *
|
||||
cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
|
||||
{
|
||||
char *str;
|
||||
size_t len, i, ret, off;
|
||||
|
||||
len = cps_bytelen(cp, cplen);
|
||||
|
||||
if (!(str = malloc(len + 1))) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for (i = 0, off = 0; i < cplen; i++, off += ret) {
|
||||
if ((ret = grapheme_encode_utf8(cp[i], str + off,
|
||||
len - off)) > (len - off)) {
|
||||
/* buffer too small */
|
||||
break;
|
||||
}
|
||||
}
|
||||
str[off] = '\\\\0';
|
||||
|
||||
return str;
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_decode_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
||||
EOF
|
81
libs/libgrapheme-2.0.2/man/grapheme_is_character_break.3
Normal file
81
libs/libgrapheme-2.0.2/man/grapheme_is_character_break.3
Normal file
@@ -0,0 +1,81 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_IS_CHARACTER_BREAK 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_is_character_break
|
||||
.Nd test for a grapheme cluster break between two codepoints
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft bool
|
||||
.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uint_least16_t *state"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_is_character_break
|
||||
function determines if there is a grapheme cluster break (see
|
||||
.Xr libgrapheme 7 )
|
||||
between the two codepoints
|
||||
.Va cp1
|
||||
and
|
||||
.Va cp2 .
|
||||
By specification this decision depends on a
|
||||
.Va state
|
||||
that can at most be completely reset after detecting a break and must
|
||||
be reset every time one deviates from sequential processing.
|
||||
.Pp
|
||||
If
|
||||
.Va state
|
||||
is
|
||||
.Dv NULL
|
||||
.Fn grapheme_is_character_break
|
||||
behaves as if it was called with a fully reset state.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_is_character_break
|
||||
function returns
|
||||
.Va true
|
||||
if there is a grapheme cluster break between the codepoints
|
||||
.Va cp1
|
||||
and
|
||||
.Va cp2
|
||||
and
|
||||
.Va false
|
||||
if there is not.
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
uint_least16_t state = 0;
|
||||
uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
|
||||
if (grapheme_is_character_break(s1[i], s1[i + 1], &state)) {
|
||||
printf("break in s1 at offset %zu\\n", i);
|
||||
}
|
||||
}
|
||||
memset(&state, 0, sizeof(state)); /* reset state */
|
||||
for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
|
||||
if (grapheme_is_character_break(s2[i], s2[i + 1], &state)) {
|
||||
printf("break in s2 at offset %zu\\n", i);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_next_character_break 3 ,
|
||||
.Xr grapheme_next_character_break_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_is_character_break
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
83
libs/libgrapheme-2.0.2/man/grapheme_is_character_break.sh
Normal file
83
libs/libgrapheme-2.0.2/man/grapheme_is_character_break.sh
Normal file
@@ -0,0 +1,83 @@
|
||||
cat << EOF
|
||||
.Dd ${MAN_DATE}
|
||||
.Dt GRAPHEME_IS_CHARACTER_BREAK 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_is_character_break
|
||||
.Nd test for a grapheme cluster break between two codepoints
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft bool
|
||||
.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uint_least16_t *state"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_is_character_break
|
||||
function determines if there is a grapheme cluster break (see
|
||||
.Xr libgrapheme 7 )
|
||||
between the two codepoints
|
||||
.Va cp1
|
||||
and
|
||||
.Va cp2 .
|
||||
By specification this decision depends on a
|
||||
.Va state
|
||||
that can at most be completely reset after detecting a break and must
|
||||
be reset every time one deviates from sequential processing.
|
||||
.Pp
|
||||
If
|
||||
.Va state
|
||||
is
|
||||
.Dv NULL
|
||||
.Fn grapheme_is_character_break
|
||||
behaves as if it was called with a fully reset state.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_is_character_break
|
||||
function returns
|
||||
.Va true
|
||||
if there is a grapheme cluster break between the codepoints
|
||||
.Va cp1
|
||||
and
|
||||
.Va cp2
|
||||
and
|
||||
.Va false
|
||||
if there is not.
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
uint_least16_t state = 0;
|
||||
uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
|
||||
if (grapheme_is_character_break(s1[i], s1[i + 1], &state)) {
|
||||
printf("break in s1 at offset %zu\\\\n", i);
|
||||
}
|
||||
}
|
||||
memset(&state, 0, sizeof(state)); /* reset state */
|
||||
for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
|
||||
if (grapheme_is_character_break(s2[i], s2[i + 1], &state)) {
|
||||
printf("break in s2 at offset %zu\\\\n", i);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_next_character_break 3 ,
|
||||
.Xr grapheme_next_character_break_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_is_character_break
|
||||
is compliant with the Unicode ${UNICODE_VERSION} specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
||||
EOF
|
51
libs/libgrapheme-2.0.2/man/grapheme_is_lowercase.3
Normal file
51
libs/libgrapheme-2.0.2/man/grapheme_is_lowercase.3
Normal file
@@ -0,0 +1,51 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_IS_LOWERCASE 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_is_lowercase
|
||||
.Nd check if codepoint array is lowercase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft bool
|
||||
.Fn grapheme_is_lowercase "const uint_least32_t *str" "size_t len" "size_t *caselen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_is_lowercase
|
||||
function checks if the codepoint array
|
||||
.Va str
|
||||
is lowercase and writes the length of the matching lowercase-sequence to the integer pointed to by
|
||||
.Va caselen ,
|
||||
unless
|
||||
.Va caselen
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the codepoint array
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
NUL-byte is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input data
|
||||
.Xr grapheme_is_lowercase_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_is_lowercase
|
||||
function returns
|
||||
.Dv true
|
||||
if the codepoint array
|
||||
.Va str
|
||||
is lowercase, otherwise
|
||||
.Dv false .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_is_lowercase_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_is_lowercase
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_is_lowercase.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_is_lowercase.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="codepoint" \
|
||||
CASE="lowercase" \
|
||||
$SH man/template/is_case.sh
|
51
libs/libgrapheme-2.0.2/man/grapheme_is_lowercase_utf8.3
Normal file
51
libs/libgrapheme-2.0.2/man/grapheme_is_lowercase_utf8.3
Normal file
@@ -0,0 +1,51 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_IS_LOWERCASE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_is_lowercase_utf8
|
||||
.Nd check if UTF-8-encoded string is lowercase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft bool
|
||||
.Fn grapheme_is_lowercase_utf8 "const char *str" "size_t len" "size_t *caselen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_is_lowercase_utf8
|
||||
function checks if the UTF-8-encoded string
|
||||
.Va str
|
||||
is lowercase and writes the length of the matching lowercase-sequence to the integer pointed to by
|
||||
.Va caselen ,
|
||||
unless
|
||||
.Va caselen
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input data
|
||||
.Xr grapheme_is_lowercase 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_is_lowercase_utf8
|
||||
function returns
|
||||
.Dv true
|
||||
if the UTF-8-encoded string
|
||||
.Va str
|
||||
is lowercase, otherwise
|
||||
.Dv false .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_is_lowercase 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_is_lowercase_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_is_lowercase_utf8.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_is_lowercase_utf8.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="utf8" \
|
||||
CASE="lowercase" \
|
||||
$SH man/template/is_case.sh
|
51
libs/libgrapheme-2.0.2/man/grapheme_is_titlecase.3
Normal file
51
libs/libgrapheme-2.0.2/man/grapheme_is_titlecase.3
Normal file
@@ -0,0 +1,51 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_IS_TITLECASE 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_is_titlecase
|
||||
.Nd check if codepoint array is titlecase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft bool
|
||||
.Fn grapheme_is_titlecase "const uint_least32_t *str" "size_t len" "size_t *caselen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_is_titlecase
|
||||
function checks if the codepoint array
|
||||
.Va str
|
||||
is titlecase and writes the length of the matching titlecase-sequence to the integer pointed to by
|
||||
.Va caselen ,
|
||||
unless
|
||||
.Va caselen
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the codepoint array
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
NUL-byte is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input data
|
||||
.Xr grapheme_is_titlecase_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_is_titlecase
|
||||
function returns
|
||||
.Dv true
|
||||
if the codepoint array
|
||||
.Va str
|
||||
is titlecase, otherwise
|
||||
.Dv false .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_is_titlecase_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_is_titlecase
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_is_titlecase.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_is_titlecase.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="codepoint" \
|
||||
CASE="titlecase" \
|
||||
$SH man/template/is_case.sh
|
51
libs/libgrapheme-2.0.2/man/grapheme_is_titlecase_utf8.3
Normal file
51
libs/libgrapheme-2.0.2/man/grapheme_is_titlecase_utf8.3
Normal file
@@ -0,0 +1,51 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_IS_TITLECASE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_is_titlecase_utf8
|
||||
.Nd check if UTF-8-encoded string is titlecase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft bool
|
||||
.Fn grapheme_is_titlecase_utf8 "const char *str" "size_t len" "size_t *caselen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_is_titlecase_utf8
|
||||
function checks if the UTF-8-encoded string
|
||||
.Va str
|
||||
is titlecase and writes the length of the matching titlecase-sequence to the integer pointed to by
|
||||
.Va caselen ,
|
||||
unless
|
||||
.Va caselen
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input data
|
||||
.Xr grapheme_is_titlecase 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_is_titlecase_utf8
|
||||
function returns
|
||||
.Dv true
|
||||
if the UTF-8-encoded string
|
||||
.Va str
|
||||
is titlecase, otherwise
|
||||
.Dv false .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_is_titlecase 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_is_titlecase_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_is_titlecase_utf8.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_is_titlecase_utf8.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="utf8" \
|
||||
CASE="titlecase" \
|
||||
$SH man/template/is_case.sh
|
51
libs/libgrapheme-2.0.2/man/grapheme_is_uppercase.3
Normal file
51
libs/libgrapheme-2.0.2/man/grapheme_is_uppercase.3
Normal file
@@ -0,0 +1,51 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_IS_UPPERCASE 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_is_uppercase
|
||||
.Nd check if codepoint array is uppercase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft bool
|
||||
.Fn grapheme_is_uppercase "const uint_least32_t *str" "size_t len" "size_t *caselen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_is_uppercase
|
||||
function checks if the codepoint array
|
||||
.Va str
|
||||
is uppercase and writes the length of the matching uppercase-sequence to the integer pointed to by
|
||||
.Va caselen ,
|
||||
unless
|
||||
.Va caselen
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the codepoint array
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
NUL-byte is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input data
|
||||
.Xr grapheme_is_uppercase_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_is_uppercase
|
||||
function returns
|
||||
.Dv true
|
||||
if the codepoint array
|
||||
.Va str
|
||||
is uppercase, otherwise
|
||||
.Dv false .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_is_uppercase_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_is_uppercase
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_is_uppercase.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_is_uppercase.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="codepoint" \
|
||||
CASE="uppercase" \
|
||||
$SH man/template/is_case.sh
|
51
libs/libgrapheme-2.0.2/man/grapheme_is_uppercase_utf8.3
Normal file
51
libs/libgrapheme-2.0.2/man/grapheme_is_uppercase_utf8.3
Normal file
@@ -0,0 +1,51 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_IS_UPPERCASE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_is_uppercase_utf8
|
||||
.Nd check if UTF-8-encoded string is uppercase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft bool
|
||||
.Fn grapheme_is_uppercase_utf8 "const char *str" "size_t len" "size_t *caselen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_is_uppercase_utf8
|
||||
function checks if the UTF-8-encoded string
|
||||
.Va str
|
||||
is uppercase and writes the length of the matching uppercase-sequence to the integer pointed to by
|
||||
.Va caselen ,
|
||||
unless
|
||||
.Va caselen
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input data
|
||||
.Xr grapheme_is_uppercase 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_is_uppercase_utf8
|
||||
function returns
|
||||
.Dv true
|
||||
if the UTF-8-encoded string
|
||||
.Va str
|
||||
is uppercase, otherwise
|
||||
.Dv false .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_is_uppercase 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_is_uppercase_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_is_uppercase_utf8.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_is_uppercase_utf8.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="utf8" \
|
||||
CASE="uppercase" \
|
||||
$SH man/template/is_case.sh
|
56
libs/libgrapheme-2.0.2/man/grapheme_next_character_break.3
Normal file
56
libs/libgrapheme-2.0.2/man/grapheme_next_character_break.3
Normal file
@@ -0,0 +1,56 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_NEXT_CHARACTER_BREAK 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_next_character_break
|
||||
.Nd determine codepoint-offset to next grapheme cluster break
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_next_character_break "const uint_least32_t *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_next_character_break
|
||||
function computes the offset (in codepoints) to the next grapheme cluster
|
||||
break (see
|
||||
.Xr libgrapheme 7 )
|
||||
in the codepoint array
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
If a grapheme cluster begins at
|
||||
.Va str
|
||||
this offset is equal to the length of said grapheme cluster.
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when
|
||||
a codepoint with the value 0 is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input
|
||||
data
|
||||
.Xr grapheme_next_character_break_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_next_character_break
|
||||
function returns the offset (in codepoints) to the next grapheme cluster
|
||||
break in
|
||||
.Va str
|
||||
or 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_is_character_break 3 ,
|
||||
.Xr grapheme_next_character_break_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_next_character_break
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
@@ -0,0 +1,4 @@
|
||||
ENCODING="codepoint" \
|
||||
TYPE="character" \
|
||||
REALTYPE="grapheme cluster" \
|
||||
$SH man/template/next_break.sh
|
@@ -0,0 +1,94 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_NEXT_CHARACTER_BREAK_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_next_character_break_utf8
|
||||
.Nd determine byte-offset to next grapheme cluster break
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_next_character_break_utf8 "const char *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_next_character_break_utf8
|
||||
function computes the offset (in bytes) to the next grapheme cluster
|
||||
break (see
|
||||
.Xr libgrapheme 7 )
|
||||
in the UTF-8-encoded string
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
If a grapheme cluster begins at
|
||||
.Va str
|
||||
this offset is equal to the length of said grapheme cluster.
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when
|
||||
a NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input
|
||||
data
|
||||
.Xr grapheme_is_character_break 3 and
|
||||
.Xr grapheme_next_character_break 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_next_character_break_utf8
|
||||
function returns the offset (in bytes) to the next grapheme cluster
|
||||
break in
|
||||
.Va str
|
||||
or 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
/* UTF-8 encoded input */
|
||||
char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
|
||||
"\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
|
||||
"\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
|
||||
"\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
|
||||
size_t ret, len, off;
|
||||
|
||||
printf("Input: \\"%s\\"\\n", s);
|
||||
|
||||
/* print each grapheme cluster with byte-length */
|
||||
printf("grapheme clusters in NUL-delimited input:\\n");
|
||||
for (off = 0; s[off] != '\\0'; off += ret) {
|
||||
ret = grapheme_next_character_break_utf8(s + off, SIZE_MAX);
|
||||
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
|
||||
}
|
||||
printf("\\n");
|
||||
|
||||
/* do the same, but this time string is length-delimited */
|
||||
len = 17;
|
||||
printf("grapheme clusters in input delimited to %zu bytes:\\n", len);
|
||||
for (off = 0; off < len; off += ret) {
|
||||
ret = grapheme_next_character_break_utf8(s + off, len - off);
|
||||
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_next_character_break 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_next_character_break_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
@@ -0,0 +1,4 @@
|
||||
ENCODING="utf8" \
|
||||
TYPE="character" \
|
||||
REALTYPE="grapheme cluster" \
|
||||
$SH man/template/next_break.sh
|
52
libs/libgrapheme-2.0.2/man/grapheme_next_line_break.3
Normal file
52
libs/libgrapheme-2.0.2/man/grapheme_next_line_break.3
Normal file
@@ -0,0 +1,52 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_NEXT_LINE_BREAK 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_next_line_break
|
||||
.Nd determine codepoint-offset to next possible line break
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_next_line_break "const uint_least32_t *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_next_line_break
|
||||
function computes the offset (in codepoints) to the next possible line
|
||||
break (see
|
||||
.Xr libgrapheme 7 )
|
||||
in the codepoint array
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when
|
||||
a codepoint with the value 0 is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input
|
||||
data
|
||||
.Xr grapheme_next_line_break_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_next_line_break
|
||||
function returns the offset (in codepoints) to the next possible line
|
||||
break in
|
||||
.Va str
|
||||
or 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_next_line_break_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_next_line_break
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
4
libs/libgrapheme-2.0.2/man/grapheme_next_line_break.sh
Normal file
4
libs/libgrapheme-2.0.2/man/grapheme_next_line_break.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
ENCODING="codepoint" \
|
||||
TYPE="line" \
|
||||
REALTYPE="possible line" \
|
||||
$SH man/template/next_break.sh
|
90
libs/libgrapheme-2.0.2/man/grapheme_next_line_break_utf8.3
Normal file
90
libs/libgrapheme-2.0.2/man/grapheme_next_line_break_utf8.3
Normal file
@@ -0,0 +1,90 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_NEXT_LINE_BREAK_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_next_line_break_utf8
|
||||
.Nd determine byte-offset to next possible line break
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_next_line_break_utf8 "const char *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_next_line_break_utf8
|
||||
function computes the offset (in bytes) to the next possible line
|
||||
break (see
|
||||
.Xr libgrapheme 7 )
|
||||
in the UTF-8-encoded string
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when
|
||||
a NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input
|
||||
data
|
||||
.Xr grapheme_next_line_break 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_next_line_break_utf8
|
||||
function returns the offset (in bytes) to the next possible line
|
||||
break in
|
||||
.Va str
|
||||
or 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
/* UTF-8 encoded input */
|
||||
char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
|
||||
"\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
|
||||
"\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
|
||||
"\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
|
||||
size_t ret, len, off;
|
||||
|
||||
printf("Input: \\"%s\\"\\n", s);
|
||||
|
||||
/* print each possible line with byte-length */
|
||||
printf("possible lines in NUL-delimited input:\\n");
|
||||
for (off = 0; s[off] != '\\0'; off += ret) {
|
||||
ret = grapheme_next_line_break_utf8(s + off, SIZE_MAX);
|
||||
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
|
||||
}
|
||||
printf("\\n");
|
||||
|
||||
/* do the same, but this time string is length-delimited */
|
||||
len = 17;
|
||||
printf("possible lines in input delimited to %zu bytes:\\n", len);
|
||||
for (off = 0; off < len; off += ret) {
|
||||
ret = grapheme_next_line_break_utf8(s + off, len - off);
|
||||
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_next_line_break 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_next_line_break_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
@@ -0,0 +1,4 @@
|
||||
ENCODING="utf8" \
|
||||
TYPE="line" \
|
||||
REALTYPE="possible line" \
|
||||
$SH man/template/next_break.sh
|
55
libs/libgrapheme-2.0.2/man/grapheme_next_sentence_break.3
Normal file
55
libs/libgrapheme-2.0.2/man/grapheme_next_sentence_break.3
Normal file
@@ -0,0 +1,55 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_NEXT_SENTENCE_BREAK 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_next_sentence_break
|
||||
.Nd determine codepoint-offset to next sentence break
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_next_sentence_break "const uint_least32_t *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_next_sentence_break
|
||||
function computes the offset (in codepoints) to the next sentence
|
||||
break (see
|
||||
.Xr libgrapheme 7 )
|
||||
in the codepoint array
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
If a sentence begins at
|
||||
.Va str
|
||||
this offset is equal to the length of said sentence.
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when
|
||||
a codepoint with the value 0 is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input
|
||||
data
|
||||
.Xr grapheme_next_sentence_break_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_next_sentence_break
|
||||
function returns the offset (in codepoints) to the next sentence
|
||||
break in
|
||||
.Va str
|
||||
or 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_next_sentence_break_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_next_sentence_break
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
@@ -0,0 +1,4 @@
|
||||
ENCODING="codepoint" \
|
||||
TYPE="sentence" \
|
||||
REALTYPE="sentence" \
|
||||
$SH man/template/next_break.sh
|
@@ -0,0 +1,93 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_NEXT_SENTENCE_BREAK_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_next_sentence_break_utf8
|
||||
.Nd determine byte-offset to next sentence break
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_next_sentence_break_utf8 "const char *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_next_sentence_break_utf8
|
||||
function computes the offset (in bytes) to the next sentence
|
||||
break (see
|
||||
.Xr libgrapheme 7 )
|
||||
in the UTF-8-encoded string
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
If a sentence begins at
|
||||
.Va str
|
||||
this offset is equal to the length of said sentence.
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when
|
||||
a NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input
|
||||
data
|
||||
.Xr grapheme_next_sentence_break 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_next_sentence_break_utf8
|
||||
function returns the offset (in bytes) to the next sentence
|
||||
break in
|
||||
.Va str
|
||||
or 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
/* UTF-8 encoded input */
|
||||
char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
|
||||
"\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
|
||||
"\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
|
||||
"\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
|
||||
size_t ret, len, off;
|
||||
|
||||
printf("Input: \\"%s\\"\\n", s);
|
||||
|
||||
/* print each sentence with byte-length */
|
||||
printf("sentences in NUL-delimited input:\\n");
|
||||
for (off = 0; s[off] != '\\0'; off += ret) {
|
||||
ret = grapheme_next_sentence_break_utf8(s + off, SIZE_MAX);
|
||||
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
|
||||
}
|
||||
printf("\\n");
|
||||
|
||||
/* do the same, but this time string is length-delimited */
|
||||
len = 17;
|
||||
printf("sentences in input delimited to %zu bytes:\\n", len);
|
||||
for (off = 0; off < len; off += ret) {
|
||||
ret = grapheme_next_sentence_break_utf8(s + off, len - off);
|
||||
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_next_sentence_break 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_next_sentence_break_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
@@ -0,0 +1,4 @@
|
||||
ENCODING="utf8" \
|
||||
TYPE="sentence" \
|
||||
REALTYPE="sentence" \
|
||||
$SH man/template/next_break.sh
|
55
libs/libgrapheme-2.0.2/man/grapheme_next_word_break.3
Normal file
55
libs/libgrapheme-2.0.2/man/grapheme_next_word_break.3
Normal file
@@ -0,0 +1,55 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_NEXT_WORD_BREAK 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_next_word_break
|
||||
.Nd determine codepoint-offset to next word break
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_next_word_break "const uint_least32_t *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_next_word_break
|
||||
function computes the offset (in codepoints) to the next word
|
||||
break (see
|
||||
.Xr libgrapheme 7 )
|
||||
in the codepoint array
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
If a word begins at
|
||||
.Va str
|
||||
this offset is equal to the length of said word.
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when
|
||||
a codepoint with the value 0 is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input
|
||||
data
|
||||
.Xr grapheme_next_word_break_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_next_word_break
|
||||
function returns the offset (in codepoints) to the next word
|
||||
break in
|
||||
.Va str
|
||||
or 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_next_word_break_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_next_word_break
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
4
libs/libgrapheme-2.0.2/man/grapheme_next_word_break.sh
Normal file
4
libs/libgrapheme-2.0.2/man/grapheme_next_word_break.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
ENCODING="codepoint" \
|
||||
TYPE="word" \
|
||||
REALTYPE="word" \
|
||||
$SH man/template/next_break.sh
|
93
libs/libgrapheme-2.0.2/man/grapheme_next_word_break_utf8.3
Normal file
93
libs/libgrapheme-2.0.2/man/grapheme_next_word_break_utf8.3
Normal file
@@ -0,0 +1,93 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_NEXT_WORD_BREAK_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_next_word_break_utf8
|
||||
.Nd determine byte-offset to next word break
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_next_word_break_utf8 "const char *str" "size_t len"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_next_word_break_utf8
|
||||
function computes the offset (in bytes) to the next word
|
||||
break (see
|
||||
.Xr libgrapheme 7 )
|
||||
in the UTF-8-encoded string
|
||||
.Va str
|
||||
of length
|
||||
.Va len .
|
||||
If a word begins at
|
||||
.Va str
|
||||
this offset is equal to the length of said word.
|
||||
.Pp
|
||||
If
|
||||
.Va len
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the string
|
||||
.Va str
|
||||
is interpreted to be NUL-terminated and processing stops when
|
||||
a NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input
|
||||
data
|
||||
.Xr grapheme_next_word_break 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_next_word_break_utf8
|
||||
function returns the offset (in bytes) to the next word
|
||||
break in
|
||||
.Va str
|
||||
or 0 if
|
||||
.Va str
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh EXAMPLES
|
||||
.Bd -literal
|
||||
/* cc (-static) -o example example.c -lgrapheme */
|
||||
#include <grapheme.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
/* UTF-8 encoded input */
|
||||
char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
|
||||
"\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
|
||||
"\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
|
||||
"\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
|
||||
size_t ret, len, off;
|
||||
|
||||
printf("Input: \\"%s\\"\\n", s);
|
||||
|
||||
/* print each word with byte-length */
|
||||
printf("words in NUL-delimited input:\\n");
|
||||
for (off = 0; s[off] != '\\0'; off += ret) {
|
||||
ret = grapheme_next_word_break_utf8(s + off, SIZE_MAX);
|
||||
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
|
||||
}
|
||||
printf("\\n");
|
||||
|
||||
/* do the same, but this time string is length-delimited */
|
||||
len = 17;
|
||||
printf("words in input delimited to %zu bytes:\\n", len);
|
||||
for (off = 0; off < len; off += ret) {
|
||||
ret = grapheme_next_word_break_utf8(s + off, len - off);
|
||||
printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
.Ed
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_next_word_break 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_next_word_break_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
@@ -0,0 +1,4 @@
|
||||
ENCODING="utf8" \
|
||||
TYPE="word" \
|
||||
REALTYPE="word" \
|
||||
$SH man/template/next_break.sh
|
56
libs/libgrapheme-2.0.2/man/grapheme_to_lowercase.3
Normal file
56
libs/libgrapheme-2.0.2/man/grapheme_to_lowercase.3
Normal file
@@ -0,0 +1,56 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_TO_LOWERCASE 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_to_lowercase
|
||||
.Nd convert codepoint array to lowercase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_to_lowercase "const uint_least32_t *src" "size_t srclen" "uint_least32_t *dest" "size_t destlen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_to_lowercase
|
||||
function converts the codepoint array
|
||||
.Va src
|
||||
to lowercase and writes the result to
|
||||
.Va dest
|
||||
up to
|
||||
.Va destlen ,
|
||||
unless
|
||||
.Va dest
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va srclen
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the codepoint array
|
||||
.Va src
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
codepoint with the value 0 is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input data
|
||||
.Xr grapheme_to_lowercase_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_to_lowercase
|
||||
function returns the number of codepoints in the array resulting
|
||||
from converting
|
||||
.Va src
|
||||
to lowercase, even if
|
||||
.Va destlen
|
||||
is not large enough or
|
||||
.Va dest
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_to_lowercase_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_to_lowercase
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_to_lowercase.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_to_lowercase.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="codepoint" \
|
||||
CASE="lowercase" \
|
||||
$SH man/template/to_case.sh
|
56
libs/libgrapheme-2.0.2/man/grapheme_to_lowercase_utf8.3
Normal file
56
libs/libgrapheme-2.0.2/man/grapheme_to_lowercase_utf8.3
Normal file
@@ -0,0 +1,56 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_TO_LOWERCASE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_to_lowercase_utf8
|
||||
.Nd convert UTF-8-encoded string to lowercase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_to_lowercase_utf8 "const char *src" "size_t srclen" "char *dest" "size_t destlen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_to_lowercase_utf8
|
||||
function converts the UTF-8-encoded string
|
||||
.Va src
|
||||
to lowercase and writes the result to
|
||||
.Va dest
|
||||
up to
|
||||
.Va destlen ,
|
||||
unless
|
||||
.Va dest
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va srclen
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
|
||||
.Va src
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input data
|
||||
.Xr grapheme_to_lowercase 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_to_lowercase_utf8
|
||||
function returns the number of bytes in the array resulting
|
||||
from converting
|
||||
.Va src
|
||||
to lowercase, even if
|
||||
.Va destlen
|
||||
is not large enough or
|
||||
.Va dest
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_to_lowercase 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_to_lowercase_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_to_lowercase_utf8.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_to_lowercase_utf8.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="utf8" \
|
||||
CASE="lowercase" \
|
||||
$SH man/template/to_case.sh
|
56
libs/libgrapheme-2.0.2/man/grapheme_to_titlecase.3
Normal file
56
libs/libgrapheme-2.0.2/man/grapheme_to_titlecase.3
Normal file
@@ -0,0 +1,56 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_TO_TITLECASE 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_to_titlecase
|
||||
.Nd convert codepoint array to titlecase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_to_titlecase "const uint_least32_t *src" "size_t srclen" "uint_least32_t *dest" "size_t destlen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_to_titlecase
|
||||
function converts the codepoint array
|
||||
.Va src
|
||||
to titlecase and writes the result to
|
||||
.Va dest
|
||||
up to
|
||||
.Va destlen ,
|
||||
unless
|
||||
.Va dest
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va srclen
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the codepoint array
|
||||
.Va src
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
codepoint with the value 0 is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input data
|
||||
.Xr grapheme_to_titlecase_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_to_titlecase
|
||||
function returns the number of codepoints in the array resulting
|
||||
from converting
|
||||
.Va src
|
||||
to titlecase, even if
|
||||
.Va destlen
|
||||
is not large enough or
|
||||
.Va dest
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_to_titlecase_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_to_titlecase
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_to_titlecase.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_to_titlecase.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="codepoint" \
|
||||
CASE="titlecase" \
|
||||
$SH man/template/to_case.sh
|
56
libs/libgrapheme-2.0.2/man/grapheme_to_titlecase_utf8.3
Normal file
56
libs/libgrapheme-2.0.2/man/grapheme_to_titlecase_utf8.3
Normal file
@@ -0,0 +1,56 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_TO_TITLECASE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_to_titlecase_utf8
|
||||
.Nd convert UTF-8-encoded string to titlecase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_to_titlecase_utf8 "const char *src" "size_t srclen" "char *dest" "size_t destlen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_to_titlecase_utf8
|
||||
function converts the UTF-8-encoded string
|
||||
.Va src
|
||||
to titlecase and writes the result to
|
||||
.Va dest
|
||||
up to
|
||||
.Va destlen ,
|
||||
unless
|
||||
.Va dest
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va srclen
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
|
||||
.Va src
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input data
|
||||
.Xr grapheme_to_titlecase 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_to_titlecase_utf8
|
||||
function returns the number of bytes in the array resulting
|
||||
from converting
|
||||
.Va src
|
||||
to titlecase, even if
|
||||
.Va destlen
|
||||
is not large enough or
|
||||
.Va dest
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_to_titlecase 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_to_titlecase_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_to_titlecase_utf8.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_to_titlecase_utf8.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="utf8" \
|
||||
CASE="titlecase" \
|
||||
$SH man/template/to_case.sh
|
56
libs/libgrapheme-2.0.2/man/grapheme_to_uppercase.3
Normal file
56
libs/libgrapheme-2.0.2/man/grapheme_to_uppercase.3
Normal file
@@ -0,0 +1,56 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_TO_UPPERCASE 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_to_uppercase
|
||||
.Nd convert codepoint array to uppercase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_to_uppercase "const uint_least32_t *src" "size_t srclen" "uint_least32_t *dest" "size_t destlen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_to_uppercase
|
||||
function converts the codepoint array
|
||||
.Va src
|
||||
to uppercase and writes the result to
|
||||
.Va dest
|
||||
up to
|
||||
.Va destlen ,
|
||||
unless
|
||||
.Va dest
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va srclen
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the codepoint array
|
||||
.Va src
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
codepoint with the value 0 is encountered.
|
||||
.Pp
|
||||
For UTF-8-encoded input data
|
||||
.Xr grapheme_to_uppercase_utf8 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_to_uppercase
|
||||
function returns the number of codepoints in the array resulting
|
||||
from converting
|
||||
.Va src
|
||||
to uppercase, even if
|
||||
.Va destlen
|
||||
is not large enough or
|
||||
.Va dest
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_to_uppercase_utf8 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_to_uppercase
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_to_uppercase.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_to_uppercase.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="codepoint" \
|
||||
CASE="uppercase" \
|
||||
$SH man/template/to_case.sh
|
56
libs/libgrapheme-2.0.2/man/grapheme_to_uppercase_utf8.3
Normal file
56
libs/libgrapheme-2.0.2/man/grapheme_to_uppercase_utf8.3
Normal file
@@ -0,0 +1,56 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt GRAPHEME_TO_UPPERCASE_UTF8 3
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm grapheme_to_uppercase_utf8
|
||||
.Nd convert UTF-8-encoded string to uppercase
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Ft size_t
|
||||
.Fn grapheme_to_uppercase_utf8 "const char *src" "size_t srclen" "char *dest" "size_t destlen"
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Fn grapheme_to_uppercase_utf8
|
||||
function converts the UTF-8-encoded string
|
||||
.Va src
|
||||
to uppercase and writes the result to
|
||||
.Va dest
|
||||
up to
|
||||
.Va destlen ,
|
||||
unless
|
||||
.Va dest
|
||||
is set to
|
||||
.Dv NULL .
|
||||
.Pp
|
||||
If
|
||||
.Va srclen
|
||||
is set to
|
||||
.Dv SIZE_MAX
|
||||
(stdint.h is already included by grapheme.h) the UTF-8-encoded string
|
||||
.Va src
|
||||
is interpreted to be NUL-terminated and processing stops when a
|
||||
NUL-byte is encountered.
|
||||
.Pp
|
||||
For non-UTF-8 input data
|
||||
.Xr grapheme_to_uppercase 3
|
||||
can be used instead.
|
||||
.Sh RETURN VALUES
|
||||
The
|
||||
.Fn grapheme_to_uppercase_utf8
|
||||
function returns the number of bytes in the array resulting
|
||||
from converting
|
||||
.Va src
|
||||
to uppercase, even if
|
||||
.Va destlen
|
||||
is not large enough or
|
||||
.Va dest
|
||||
is
|
||||
.Dv NULL .
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_to_uppercase 3 ,
|
||||
.Xr libgrapheme 7
|
||||
.Sh STANDARDS
|
||||
.Fn grapheme_to_uppercase_utf8
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
3
libs/libgrapheme-2.0.2/man/grapheme_to_uppercase_utf8.sh
Normal file
3
libs/libgrapheme-2.0.2/man/grapheme_to_uppercase_utf8.sh
Normal file
@@ -0,0 +1,3 @@
|
||||
ENCODING="utf8" \
|
||||
CASE="uppercase" \
|
||||
$SH man/template/to_case.sh
|
165
libs/libgrapheme-2.0.2/man/libgrapheme.7
Normal file
165
libs/libgrapheme-2.0.2/man/libgrapheme.7
Normal file
@@ -0,0 +1,165 @@
|
||||
.Dd 2022-10-06
|
||||
.Dt LIBGRAPHEME 7
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm libgrapheme
|
||||
.Nd unicode string library
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Nm
|
||||
library provides functions to properly handle Unicode strings according
|
||||
to the Unicode specification in regard to character, word, sentence and
|
||||
line segmentation and case detection and conversion.
|
||||
.Pp
|
||||
Unicode strings are made up of user-perceived characters (so-called
|
||||
.Dq grapheme clusters ,
|
||||
see
|
||||
.Sx MOTIVATION )
|
||||
that are composed of one or more Unicode codepoints, which in turn
|
||||
are encoded in one or more bytes in an encoding like UTF-8.
|
||||
.Pp
|
||||
There is a widespread misconception that it is enough to simply
|
||||
determine codepoints in a string and treat them as user-perceived
|
||||
characters to be Unicode compliant.
|
||||
While this may work in some cases, this assumption quickly breaks,
|
||||
especially for non-Western languages and decomposed Unicode strings
|
||||
where user-perceived characters are usually represented using multiple
|
||||
codepoints.
|
||||
.Pp
|
||||
Despite this complicated multilevel structure of Unicode strings,
|
||||
.Nm
|
||||
provides methods to work with them at the byte-level (i.e. UTF-8
|
||||
.Sq char
|
||||
arrays) while also offering codepoint-level methods.
|
||||
Additionally, it is a
|
||||
.Dq freestanding
|
||||
library (see ISO/IEC 9899:1999 section 4.6) and thus does not depend on
|
||||
a standard library. This makes it easy to use in bare metal environments.
|
||||
.Pp
|
||||
Every documented function's manual page provides a self-contained
|
||||
example illustrating the possible usage.
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_decode_utf8 3 ,
|
||||
.Xr grapheme_encode_utf8 3 ,
|
||||
.Xr grapheme_is_character_break 3 ,
|
||||
.Xr grapheme_is_lowercase 3 ,
|
||||
.Xr grapheme_is_lowercase_utf8 3 ,
|
||||
.Xr grapheme_is_titlecase 3 ,
|
||||
.Xr grapheme_is_titlecase_utf8 3 ,
|
||||
.Xr grapheme_is_uppercase 3 ,
|
||||
.Xr grapheme_is_uppercase_utf8 3 ,
|
||||
.Xr grapheme_next_character_break 3 ,
|
||||
.Xr grapheme_next_character_break_utf8 3 ,
|
||||
.Xr grapheme_next_line_break 3 ,
|
||||
.Xr grapheme_next_line_break_utf8 3 ,
|
||||
.Xr grapheme_next_sentence_break 3 ,
|
||||
.Xr grapheme_next_sentence_break_utf8 3 ,
|
||||
.Xr grapheme_next_word_break 3 ,
|
||||
.Xr grapheme_next_word_break_utf8 3 ,
|
||||
.Xr grapheme_to_lowercase 3 ,
|
||||
.Xr grapheme_to_lowercase_utf8 3 ,
|
||||
.Xr grapheme_to_titlecase 3 ,
|
||||
.Xr grapheme_to_titlecase_utf8 3 ,
|
||||
.Xr grapheme_to_uppercase 3 ,
|
||||
.Xr grapheme_to_uppercase_utf8 3
|
||||
.Sh STANDARDS
|
||||
.Nm
|
||||
is compliant with the Unicode 15.0.0 specification.
|
||||
.Sh MOTIVATION
|
||||
The idea behind every character encoding scheme like ASCII or Unicode
|
||||
is to express abstract characters (which can be thought of as shapes
|
||||
making up a written language). ASCII for instance, which comprises the
|
||||
range 0 to 127, assigns the number 65 (0x41) to the abstract character
|
||||
.Sq A .
|
||||
This number is called a
|
||||
.Dq codepoint ,
|
||||
and all codepoints of an encoding make up its so-called
|
||||
.Dq code space .
|
||||
.Pp
|
||||
Unicode's code space is much larger, ranging from 0 to 0x10FFFF, but its
|
||||
first 128 codepoints are identical to ASCII's. The additional code
|
||||
points are needed as Unicode's goal is to express all writing systems
|
||||
of the world.
|
||||
To give an example, the abstract character
|
||||
.Sq \[u00C4]
|
||||
is not expressible in ASCII, given no ASCII codepoint has been assigned
|
||||
to it.
|
||||
It can be expressed in Unicode, though, with the codepoint 196 (0xC4).
|
||||
.Pp
|
||||
One may assume that this process is straightforward, but as more and
|
||||
more codepoints were assigned to abstract characters, the Unicode
|
||||
Consortium (that defines the Unicode standard) was facing a problem:
|
||||
Many (mostly non-European) languages have such a large amount of
|
||||
abstract characters that it would exhaust the available Unicode code
|
||||
space if one tried to assign a codepoint to each abstract character.
|
||||
The solution to that problem is best introduced with an example: Consider
|
||||
the abstract character
|
||||
.Sq \[u01DE] ,
|
||||
which is
|
||||
.Sq A
|
||||
with an umlaut and a macron added to it.
|
||||
In this sense, one can consider
|
||||
.Sq \[u01DE]
|
||||
as a two-fold modification (namely
|
||||
.Dq add umlaut
|
||||
and
|
||||
.Dq add macron )
|
||||
of the
|
||||
.Dq base character
|
||||
.Sq A .
|
||||
.Pp
|
||||
The Unicode Consortium adapted this idea by assigning codepoints to
|
||||
modifications.
|
||||
For example, the codepoint 0x308 represents adding an umlaut and 0x304
|
||||
represents adding a macron, and thus, the codepoint sequence
|
||||
.Dq 0x41 0x308 0x304 ,
|
||||
namely the base character
|
||||
.Sq A
|
||||
followed by the umlaut and macron modifiers, represents the abstract
|
||||
character
|
||||
.Sq \[u01DE] .
|
||||
As a side-note, the single codepoint 0x1DE was also assigned to
|
||||
.Sq \[u01DE] ,
|
||||
which is a good example for the fact that there can be multiple
|
||||
representations of a single abstract character in Unicode.
|
||||
.Pp
|
||||
Expressing a single abstract character with multiple codepoints solved
|
||||
the code space exhaustion-problem, and the concept has been greatly
|
||||
expanded since its first introduction (emojis, joiners, etc.). A sequence
|
||||
(which can also have the length 1) of codepoints that belong together
|
||||
this way and represents an abstract character is called a
|
||||
.Dq grapheme cluster .
|
||||
.Pp
|
||||
In many applications it is necessary to count the number of
|
||||
user-perceived characters, i.e. grapheme clusters, in a string.
|
||||
A good example for this is a terminal text editor, which needs to
|
||||
properly align characters on a grid.
|
||||
This is pretty simple with ASCII-strings, where you just count the number
|
||||
of bytes (as each byte is a codepoint and each codepoint is a grapheme
|
||||
cluster).
|
||||
With Unicode-strings, it is a common mistake to simply adapt the
|
||||
ASCII-approach and count the number of code points.
|
||||
This is wrong, as, for example, the sequence
|
||||
.Dq 0x41 0x308 0x304 ,
|
||||
while made up of 3 codepoints, is a single grapheme cluster and
|
||||
represents the user-perceived character
|
||||
.Sq \[u01DE] .
|
||||
.Pp
|
||||
The proper way to segment a string into user-perceived characters
|
||||
is to segment it into its grapheme clusters by applying the Unicode
|
||||
grapheme cluster breaking algorithm (UAX #29).
|
||||
It is based on a complex ruleset and lookup-tables and determines if a
|
||||
grapheme cluster ends or is continued between two codepoints.
|
||||
Libraries like ICU and libunistring, which also offer this functionality,
|
||||
are often bloated, not correct, difficult to use or not reasonably
|
||||
statically linkable.
|
||||
.Pp
|
||||
Analogously, the standard provides algorithms to separate strings by
|
||||
words, sentences and lines, convert cases and compare strings.
|
||||
The motivation behind
|
||||
.Nm
|
||||
is to make unicode handling suck less and abide by the UNIX philosophy.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
167
libs/libgrapheme-2.0.2/man/libgrapheme.sh
Normal file
167
libs/libgrapheme-2.0.2/man/libgrapheme.sh
Normal file
@@ -0,0 +1,167 @@
|
||||
cat << EOF
|
||||
.Dd ${MAN_DATE}
|
||||
.Dt LIBGRAPHEME 7
|
||||
.Os suckless.org
|
||||
.Sh NAME
|
||||
.Nm libgrapheme
|
||||
.Nd unicode string library
|
||||
.Sh SYNOPSIS
|
||||
.In grapheme.h
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Nm
|
||||
library provides functions to properly handle Unicode strings according
|
||||
to the Unicode specification in regard to character, word, sentence and
|
||||
line segmentation and case detection and conversion.
|
||||
.Pp
|
||||
Unicode strings are made up of user-perceived characters (so-called
|
||||
.Dq grapheme clusters ,
|
||||
see
|
||||
.Sx MOTIVATION )
|
||||
that are composed of one or more Unicode codepoints, which in turn
|
||||
are encoded in one or more bytes in an encoding like UTF-8.
|
||||
.Pp
|
||||
There is a widespread misconception that it is enough to simply
|
||||
determine codepoints in a string and treat them as user-perceived
|
||||
characters to be Unicode compliant.
|
||||
While this may work in some cases, this assumption quickly breaks,
|
||||
especially for non-Western languages and decomposed Unicode strings
|
||||
where user-perceived characters are usually represented using multiple
|
||||
codepoints.
|
||||
.Pp
|
||||
Despite this complicated multilevel structure of Unicode strings,
|
||||
.Nm
|
||||
provides methods to work with them at the byte-level (i.e. UTF-8
|
||||
.Sq char
|
||||
arrays) while also offering codepoint-level methods.
|
||||
Additionally, it is a
|
||||
.Dq freestanding
|
||||
library (see ISO/IEC 9899:1999 section 4.6) and thus does not depend on
|
||||
a standard library. This makes it easy to use in bare metal environments.
|
||||
.Pp
|
||||
Every documented function's manual page provides a self-contained
|
||||
example illustrating the possible usage.
|
||||
.Sh SEE ALSO
|
||||
.Xr grapheme_decode_utf8 3 ,
|
||||
.Xr grapheme_encode_utf8 3 ,
|
||||
.Xr grapheme_is_character_break 3 ,
|
||||
.Xr grapheme_is_lowercase 3 ,
|
||||
.Xr grapheme_is_lowercase_utf8 3 ,
|
||||
.Xr grapheme_is_titlecase 3 ,
|
||||
.Xr grapheme_is_titlecase_utf8 3 ,
|
||||
.Xr grapheme_is_uppercase 3 ,
|
||||
.Xr grapheme_is_uppercase_utf8 3 ,
|
||||
.Xr grapheme_next_character_break 3 ,
|
||||
.Xr grapheme_next_character_break_utf8 3 ,
|
||||
.Xr grapheme_next_line_break 3 ,
|
||||
.Xr grapheme_next_line_break_utf8 3 ,
|
||||
.Xr grapheme_next_sentence_break 3 ,
|
||||
.Xr grapheme_next_sentence_break_utf8 3 ,
|
||||
.Xr grapheme_next_word_break 3 ,
|
||||
.Xr grapheme_next_word_break_utf8 3 ,
|
||||
.Xr grapheme_to_lowercase 3 ,
|
||||
.Xr grapheme_to_lowercase_utf8 3 ,
|
||||
.Xr grapheme_to_titlecase 3 ,
|
||||
.Xr grapheme_to_titlecase_utf8 3 ,
|
||||
.Xr grapheme_to_uppercase 3 ,
|
||||
.Xr grapheme_to_uppercase_utf8 3
|
||||
.Sh STANDARDS
|
||||
.Nm
|
||||
is compliant with the Unicode ${UNICODE_VERSION} specification.
|
||||
.Sh MOTIVATION
|
||||
The idea behind every character encoding scheme like ASCII or Unicode
|
||||
is to express abstract characters (which can be thought of as shapes
|
||||
making up a written language). ASCII for instance, which comprises the
|
||||
range 0 to 127, assigns the number 65 (0x41) to the abstract character
|
||||
.Sq A .
|
||||
This number is called a
|
||||
.Dq codepoint ,
|
||||
and all codepoints of an encoding make up its so-called
|
||||
.Dq code space .
|
||||
.Pp
|
||||
Unicode's code space is much larger, ranging from 0 to 0x10FFFF, but its
|
||||
first 128 codepoints are identical to ASCII's. The additional code
|
||||
points are needed as Unicode's goal is to express all writing systems
|
||||
of the world.
|
||||
To give an example, the abstract character
|
||||
.Sq \[u00C4]
|
||||
is not expressible in ASCII, given no ASCII codepoint has been assigned
|
||||
to it.
|
||||
It can be expressed in Unicode, though, with the codepoint 196 (0xC4).
|
||||
.Pp
|
||||
One may assume that this process is straightforward, but as more and
|
||||
more codepoints were assigned to abstract characters, the Unicode
|
||||
Consortium (that defines the Unicode standard) was facing a problem:
|
||||
Many (mostly non-European) languages have such a large amount of
|
||||
abstract characters that it would exhaust the available Unicode code
|
||||
space if one tried to assign a codepoint to each abstract character.
|
||||
The solution to that problem is best introduced with an example: Consider
|
||||
the abstract character
|
||||
.Sq \[u01DE] ,
|
||||
which is
|
||||
.Sq A
|
||||
with an umlaut and a macron added to it.
|
||||
In this sense, one can consider
|
||||
.Sq \[u01DE]
|
||||
as a two-fold modification (namely
|
||||
.Dq add umlaut
|
||||
and
|
||||
.Dq add macron )
|
||||
of the
|
||||
.Dq base character
|
||||
.Sq A .
|
||||
.Pp
|
||||
The Unicode Consortium adapted this idea by assigning codepoints to
|
||||
modifications.
|
||||
For example, the codepoint 0x308 represents adding an umlaut and 0x304
|
||||
represents adding a macron, and thus, the codepoint sequence
|
||||
.Dq 0x41 0x308 0x304 ,
|
||||
namely the base character
|
||||
.Sq A
|
||||
followed by the umlaut and macron modifiers, represents the abstract
|
||||
character
|
||||
.Sq \[u01DE] .
|
||||
As a side-note, the single codepoint 0x1DE was also assigned to
|
||||
.Sq \[u01DE] ,
|
||||
which is a good example for the fact that there can be multiple
|
||||
representations of a single abstract character in Unicode.
|
||||
.Pp
|
||||
Expressing a single abstract character with multiple codepoints solved
|
||||
the code space exhaustion-problem, and the concept has been greatly
|
||||
expanded since its first introduction (emojis, joiners, etc.). A sequence
|
||||
(which can also have the length 1) of codepoints that belong together
|
||||
this way and represents an abstract character is called a
|
||||
.Dq grapheme cluster .
|
||||
.Pp
|
||||
In many applications it is necessary to count the number of
|
||||
user-perceived characters, i.e. grapheme clusters, in a string.
|
||||
A good example for this is a terminal text editor, which needs to
|
||||
properly align characters on a grid.
|
||||
This is pretty simple with ASCII-strings, where you just count the number
|
||||
of bytes (as each byte is a codepoint and each codepoint is a grapheme
|
||||
cluster).
|
||||
With Unicode-strings, it is a common mistake to simply adapt the
|
||||
ASCII-approach and count the number of code points.
|
||||
This is wrong, as, for example, the sequence
|
||||
.Dq 0x41 0x308 0x304 ,
|
||||
while made up of 3 codepoints, is a single grapheme cluster and
|
||||
represents the user-perceived character
|
||||
.Sq \[u01DE] .
|
||||
.Pp
|
||||
The proper way to segment a string into user-perceived characters
|
||||
is to segment it into its grapheme clusters by applying the Unicode
|
||||
grapheme cluster breaking algorithm (UAX #29).
|
||||
It is based on a complex ruleset and lookup-tables and determines if a
|
||||
grapheme cluster ends or is continued between two codepoints.
|
||||
Libraries like ICU and libunistring, which also offer this functionality,
|
||||
are often bloated, not correct, difficult to use or not reasonably
|
||||
statically linkable.
|
||||
.Pp
|
||||
Analogously, the standard provides algorithms to separate strings by
|
||||
words, sentences and lines, convert cases and compare strings.
|
||||
The motivation behind
|
||||
.Nm
|
||||
is to make unicode handling suck less and abide by the UNIX philosophy.
|
||||
.Sh AUTHORS
|
||||
.An Laslo Hunhold Aq Mt dev@frign.de
|
||||
EOF
|
67
libs/libgrapheme-2.0.2/man/template/is_case.sh
Normal file
67
libs/libgrapheme-2.0.2/man/template/is_case.sh
Normal file
@@ -0,0 +1,67 @@
|
||||
# Pick the wording and C element type for the requested variant.  Only
# ENCODING="utf8" selects the byte-oriented flavour; any other value
# (including unset) yields the codepoint-array flavour.
case "$ENCODING" in
utf8)
	UNIT="byte"
	ARRAYTYPE="UTF-8-encoded string"
	SUFFIX="_utf8"
	ANTISUFFIX=""
	DATATYPE="char"
	;;
*)
	UNIT="codepoint"
	ARRAYTYPE="codepoint array"
	SUFFIX=""
	ANTISUFFIX="_utf8"
	DATATYPE="uint_least32_t"
	;;
esac
|
||||
|
||||
# Write the mdoc(7) source of grapheme_is_${CASE}${SUFFIX}(3) to stdout.
# Reads CASE, ENCODING, MAN_DATE and UNICODE_VERSION as provided by the
# caller, together with ARRAYTYPE, SUFFIX, ANTISUFFIX and DATATYPE.
#
# The tr character classes are quoted so the shell cannot glob-expand
# them against single-letter file names in the working directory.
cat << EOF
.Dd ${MAN_DATE}
.Dt GRAPHEME_IS_$(printf "%s%s" "$CASE" "$SUFFIX" | tr "[:lower:]" "[:upper:]") 3
.Os suckless.org
.Sh NAME
.Nm grapheme_is_${CASE}${SUFFIX}
.Nd check if ${ARRAYTYPE} is ${CASE}
.Sh SYNOPSIS
.In grapheme.h
.Ft bool
.Fn grapheme_is_${CASE}${SUFFIX} "const ${DATATYPE} *str" "size_t len" "size_t *caselen"
.Sh DESCRIPTION
The
.Fn grapheme_is_${CASE}${SUFFIX}
function checks if the ${ARRAYTYPE}
.Va str
is ${CASE} and writes the length of the matching ${CASE}-sequence to the integer pointed to by
.Va caselen ,
unless
.Va caselen
is set to
.Dv NULL .
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the ${ARRAYTYPE}
.Va str
is interpreted to be NUL-terminated and processing stops when
a $(if [ "$ENCODING" = "utf8" ]; then printf "NUL-byte"; else printf "codepoint with the value 0"; fi) is encountered.
.Pp
For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input data
.Xr grapheme_is_${CASE}${ANTISUFFIX} 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_is_${CASE}${SUFFIX}
function returns
.Dv true
if the ${ARRAYTYPE}
.Va str
is ${CASE}, otherwise
.Dv false .
.Sh SEE ALSO
.Xr grapheme_is_${CASE}${ANTISUFFIX} 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_is_${CASE}${SUFFIX}
is compliant with the Unicode ${UNICODE_VERSION} specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF
|
112
libs/libgrapheme-2.0.2/man/template/next_break.sh
Normal file
112
libs/libgrapheme-2.0.2/man/template/next_break.sh
Normal file
@@ -0,0 +1,112 @@
|
||||
# Pick the unit name and function-name suffixes for the requested
# variant.  Only ENCODING="utf8" selects the byte-oriented flavour; any
# other value (including unset) yields the codepoint flavour.
case "$ENCODING" in
utf8)
	UNIT="byte"
	SUFFIX="_utf8"
	ANTISUFFIX=""
	;;
*)
	UNIT="codepoint"
	SUFFIX=""
	ANTISUFFIX="_utf8"
	;;
esac
|
||||
|
||||
# Write the header, SYNOPSIS, DESCRIPTION and RETURN VALUES sections of
# grapheme_next_${TYPE}_break${SUFFIX}(3) to stdout.  Reads TYPE,
# REALTYPE, ENCODING and MAN_DATE as provided by the caller, together
# with UNIT, SUFFIX and ANTISUFFIX.
#
# The tr character classes are quoted so the shell cannot glob-expand
# them, and REALTYPE is passed to printf as an argument rather than
# being spliced into the format string.
cat << EOF
.Dd ${MAN_DATE}
.Dt GRAPHEME_NEXT_$(printf "%s_break%s" "$TYPE" "$SUFFIX" | tr "[:lower:]" "[:upper:]") 3
.Os suckless.org
.Sh NAME
.Nm grapheme_next_${TYPE}_break${SUFFIX}
.Nd determine ${UNIT}-offset to next ${REALTYPE} break
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_next_${TYPE}_break${SUFFIX} "const $(if [ "$ENCODING" = "utf8" ]; then printf "char"; else printf "uint_least32_t"; fi) *str" "size_t len"
.Sh DESCRIPTION
The
.Fn grapheme_next_${TYPE}_break${SUFFIX}
function computes the offset (in ${UNIT}s) to the next ${REALTYPE}
break (see
.Xr libgrapheme 7 )
in the $(if [ "$ENCODING" = "utf8" ]; then printf "UTF-8-encoded string"; else printf "codepoint array"; fi)
.Va str
of length
.Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a %s begins at\n.Va str\nthis offset is equal to the length of said %s." "$REALTYPE" "$REALTYPE"; fi)
.Pp
If
.Va len
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the string
.Va str
is interpreted to be NUL-terminated and processing stops when
a $(if [ "$ENCODING" = "utf8" ]; then printf "NUL-byte"; else printf "codepoint with the value 0"; fi) is encountered.
.Pp
For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input
data$(if [ "$TYPE" = "character" ] && [ "$ENCODING" = "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 and"; fi)
.Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_next_${TYPE}_break${SUFFIX}
function returns the offset (in ${UNIT}s) to the next ${REALTYPE}
break in
.Va str
or 0 if
.Va str
is
.Dv NULL .
EOF
|
||||
|
||||
# Only the UTF-8 flavour of the page carries an EXAMPLES section.  The
# here-document is unquoted, so every run of four backslashes below
# reaches the mdoc source as two.
case "$ENCODING" in
utf8)
	cat << EOF
.Sh EXAMPLES
.Bd -literal
/* cc (-static) -o example example.c -lgrapheme */
#include <grapheme.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
/* UTF-8 encoded input */
char *s = "T\\\\xC3\\\\xABst \\\\xF0\\\\x9F\\\\x91\\\\xA8\\\\xE2\\\\x80\\\\x8D\\\\xF0"
"\\\\x9F\\\\x91\\\\xA9\\\\xE2\\\\x80\\\\x8D\\\\xF0\\\\x9F\\\\x91\\\\xA6 \\\\xF0"
"\\\\x9F\\\\x87\\\\xBA\\\\xF0\\\\x9F\\\\x87\\\\xB8 \\\\xE0\\\\xA4\\\\xA8\\\\xE0"
"\\\\xA5\\\\x80 \\\\xE0\\\\xAE\\\\xA8\\\\xE0\\\\xAE\\\\xBF!";
size_t ret, len, off;

printf("Input: \\\\"%s\\\\"\\\\n", s);

/* print each ${REALTYPE} with byte-length */
printf("${REALTYPE}s in NUL-delimited input:\\\\n");
for (off = 0; s[off] != '\\\\0'; off += ret) {
ret = grapheme_next_${TYPE}_break_utf8(s + off, SIZE_MAX);
printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
}
printf("\\\\n");

/* do the same, but this time string is length-delimited */
len = 17;
printf("${REALTYPE}s in input delimited to %zu bytes:\\\\n", len);
for (off = 0; off < len; off += ret) {
ret = grapheme_next_${TYPE}_break_utf8(s + off, len - off);
printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
}

return 0;
}
.Ed
EOF
	;;
esac
|
||||
|
||||
# Write the closing sections (SEE ALSO, STANDARDS, AUTHORS) to stdout.
# The codepoint-level "character" page gets one extra cross reference,
# prepared up front so the here-document itself stays linear.
see_also_extra=""
if [ "$TYPE" = "character" ] && [ "$ENCODING" != "utf8" ]; then
	see_also_extra=$(printf "\n.Xr grapheme_is_character_break 3 ,")
fi

cat << EOF
.Sh SEE ALSO${see_also_extra}
.Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_next_${TYPE}_break${SUFFIX}
is compliant with the Unicode ${UNICODE_VERSION} specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF
|
72
libs/libgrapheme-2.0.2/man/template/to_case.sh
Normal file
72
libs/libgrapheme-2.0.2/man/template/to_case.sh
Normal file
@@ -0,0 +1,72 @@
|
||||
# Pick the wording and C element type for the requested variant.  Only
# ENCODING="utf8" selects the byte-oriented flavour; any other value
# (including unset) yields the codepoint-array flavour.
case "$ENCODING" in
utf8)
	UNIT="byte"
	ARRAYTYPE="UTF-8-encoded string"
	SUFFIX="_utf8"
	ANTISUFFIX=""
	DATATYPE="char"
	;;
*)
	UNIT="codepoint"
	ARRAYTYPE="codepoint array"
	SUFFIX=""
	ANTISUFFIX="_utf8"
	DATATYPE="uint_least32_t"
	;;
esac
|
||||
|
||||
# Write the mdoc(7) source of grapheme_to_${CASE}${SUFFIX}(3) to stdout.
# Reads CASE, ENCODING, MAN_DATE and UNICODE_VERSION as provided by the
# caller, together with UNIT, ARRAYTYPE, SUFFIX, ANTISUFFIX and DATATYPE.
#
# The tr character classes are quoted so the shell cannot glob-expand
# them against single-letter file names in the working directory.
cat << EOF
.Dd ${MAN_DATE}
.Dt GRAPHEME_TO_$(printf "%s%s" "$CASE" "$SUFFIX" | tr "[:lower:]" "[:upper:]") 3
.Os suckless.org
.Sh NAME
.Nm grapheme_to_${CASE}${SUFFIX}
.Nd convert ${ARRAYTYPE} to ${CASE}
.Sh SYNOPSIS
.In grapheme.h
.Ft size_t
.Fn grapheme_to_${CASE}${SUFFIX} "const ${DATATYPE} *src" "size_t srclen" "${DATATYPE} *dest" "size_t destlen"
.Sh DESCRIPTION
The
.Fn grapheme_to_${CASE}${SUFFIX}
function converts the ${ARRAYTYPE}
.Va src
to ${CASE} and writes the result to
.Va dest
up to
.Va destlen ,
unless
.Va dest
is set to
.Dv NULL .
.Pp
If
.Va srclen
is set to
.Dv SIZE_MAX
(stdint.h is already included by grapheme.h) the ${ARRAYTYPE}
.Va src
is interpreted to be NUL-terminated and processing stops when
a $(if [ "$ENCODING" = "utf8" ]; then printf "NUL-byte"; else printf "codepoint with the value 0"; fi) is encountered.
.Pp
For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input data
.Xr grapheme_to_${CASE}${ANTISUFFIX} 3
can be used instead.
.Sh RETURN VALUES
The
.Fn grapheme_to_${CASE}${SUFFIX}
function returns the number of ${UNIT}s in the array resulting
from converting
.Va src
to ${CASE}, even if
.Va destlen
is not large enough or
.Va dest
is
.Dv NULL .
.Sh SEE ALSO
.Xr grapheme_to_${CASE}${ANTISUFFIX} 3 ,
.Xr libgrapheme 7
.Sh STANDARDS
.Fn grapheme_to_${CASE}${SUFFIX}
is compliant with the Unicode ${UNICODE_VERSION} specification.
.Sh AUTHORS
.An Laslo Hunhold Aq Mt dev@frign.de
EOF
|
Reference in New Issue
Block a user