/* See LICENSE file for copyright and license details. */
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "../grapheme.h"
#include "util.h"

static const struct {
|
|
char *arr; /* UTF-8 byte sequence */
|
|
size_t len; /* length of UTF-8 byte sequence */
|
|
size_t exp_len; /* expected length returned */
|
|
uint_least32_t exp_cp; /* expected codepoint returned */
|
|
} dec_test[] = {
|
|
{
|
|
/* empty sequence
|
|
* [ ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = NULL,
|
|
.len = 0,
|
|
.exp_len = 0,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid lead byte
|
|
* [ 11111101 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xFD },
|
|
.len = 1,
|
|
.exp_len = 1,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* valid 1-byte sequence
|
|
* [ 00000001 ] ->
|
|
* 0000001
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0x01 },
|
|
.len = 1,
|
|
.exp_len = 1,
|
|
.exp_cp = 0x1,
|
|
},
|
|
{
|
|
/* valid 2-byte sequence
|
|
* [ 11000011 10111111 ] ->
|
|
* 00011111111
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
|
|
.len = 2,
|
|
.exp_len = 2,
|
|
.exp_cp = 0xFF,
|
|
},
|
|
{
|
|
/* invalid 2-byte sequence (second byte missing)
|
|
* [ 11000011 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xC3 },
|
|
.len = 1,
|
|
.exp_len = 2,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 2-byte sequence (second byte malformed)
|
|
* [ 11000011 11111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xC3, 0xFF },
|
|
.len = 2,
|
|
.exp_len = 1,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 2-byte sequence (overlong encoded)
|
|
* [ 11000001 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xC1, 0xBF },
|
|
.len = 2,
|
|
.exp_len = 2,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* valid 3-byte sequence
|
|
* [ 11100000 10111111 10111111 ] ->
|
|
* 0000111111111111
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
|
|
.len = 3,
|
|
.exp_len = 3,
|
|
.exp_cp = 0xFFF,
|
|
},
|
|
{
|
|
/* invalid 3-byte sequence (second byte missing)
|
|
* [ 11100000 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xE0 },
|
|
.len = 1,
|
|
.exp_len = 3,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 3-byte sequence (second byte malformed)
|
|
* [ 11100000 01111111 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
|
|
.len = 3,
|
|
.exp_len = 1,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 3-byte sequence (short string, second byte malformed)
|
|
* [ 11100000 01111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xE0, 0x7F },
|
|
.len = 2,
|
|
.exp_len = 1,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 3-byte sequence (third byte missing)
|
|
* [ 11100000 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xE0, 0xBF },
|
|
.len = 2,
|
|
.exp_len = 3,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 3-byte sequence (third byte malformed)
|
|
* [ 11100000 10111111 01111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
|
|
.len = 3,
|
|
.exp_len = 2,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 3-byte sequence (overlong encoded)
|
|
* [ 11100000 10011111 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
|
|
.len = 3,
|
|
.exp_len = 3,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 3-byte sequence (UTF-16 surrogate half)
|
|
* [ 11101101 10100000 10000000 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
|
|
.len = 3,
|
|
.exp_len = 3,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* valid 4-byte sequence
|
|
* [ 11110011 10111111 10111111 10111111 ] ->
|
|
* 011111111111111111111
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
|
|
.len = 4,
|
|
.exp_len = 4,
|
|
.exp_cp = UINT32_C(0xFFFFF),
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (second byte missing)
|
|
* [ 11110011 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3 },
|
|
.len = 1,
|
|
.exp_len = 4,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (second byte malformed)
|
|
* [ 11110011 01111111 10111111 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
|
|
.len = 4,
|
|
.exp_len = 1,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (short string 1, second byte malformed)
|
|
* [ 11110011 011111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3, 0x7F },
|
|
.len = 2,
|
|
.exp_len = 1,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (short string 2, second byte malformed)
|
|
* [ 11110011 011111111 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF },
|
|
.len = 3,
|
|
.exp_len = 1,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
|
|
{
|
|
/* invalid 4-byte sequence (third byte missing)
|
|
* [ 11110011 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3, 0xBF },
|
|
.len = 2,
|
|
.exp_len = 4,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (third byte malformed)
|
|
* [ 11110011 10111111 01111111 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
|
|
.len = 4,
|
|
.exp_len = 2,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (short string, third byte malformed)
|
|
* [ 11110011 10111111 01111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F },
|
|
.len = 3,
|
|
.exp_len = 2,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (fourth byte missing)
|
|
* [ 11110011 10111111 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
|
|
.len = 3,
|
|
.exp_len = 4,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (fourth byte malformed)
|
|
* [ 11110011 10111111 10111111 01111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
|
|
.len = 4,
|
|
.exp_len = 3,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (overlong encoded)
|
|
* [ 11110000 10000000 10000001 10111111 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
|
|
.len = 4,
|
|
.exp_len = 4,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
{
|
|
/* invalid 4-byte sequence (UTF-16-unrepresentable)
|
|
* [ 11110100 10010000 10000000 10000000 ] ->
|
|
* INVALID
|
|
*/
|
|
.arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
|
|
.len = 4,
|
|
.exp_len = 4,
|
|
.exp_cp = GRAPHEME_INVALID_CODEPOINT,
|
|
},
|
|
};
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
size_t i, failed;
|
|
|
|
(void)argc;
|
|
|
|
/* UTF-8 decoder test */
|
|
for (i = 0, failed = 0; i < LEN(dec_test); i++) {
|
|
size_t len;
|
|
uint_least32_t cp;
|
|
|
|
len = grapheme_decode_utf8(dec_test[i].arr,
|
|
dec_test[i].len, &cp);
|
|
|
|
if (len != dec_test[i].exp_len ||
|
|
cp != dec_test[i].exp_cp) {
|
|
fprintf(stderr, "%s: Failed test %zu: "
|
|
"Expected (%zx,%u), but got (%zx,%u).\n",
|
|
argv[0], i, dec_test[i].exp_len,
|
|
dec_test[i].exp_cp, len, cp);
|
|
failed++;
|
|
}
|
|
}
|
|
printf("%s: %zu/%zu unit tests passed.\n", argv[0],
|
|
LEN(dec_test) - failed, LEN(dec_test));
|
|
|
|
return (failed > 0) ? 1 : 0;
|
|
}
|