CX Framework
Cross-platform C utility framework
Loading...
Searching...
No Matches
string_private_utf8.h
1#pragma once
2
// Classifies a UTF-8 lead byte.
// Returns the total number of bytes in the sequence that starts with u
// (1 to 4), or 0 if u cannot begin a sequence.
_meta_inline _Pure uint32 _strUTF8SeqLen(uint8 u)
{
    // The thresholds are tested in ascending order, so each test only has to
    // name the upper edge of its range.
    if (u <= 0x7f)
        return 1;       // plain ASCII, encoded as a single byte
    if (u <= 0xc1)
        return 0;       // 0x80-0xbf: continuation byte, 0xc0/0xc1: overlong lead
    if (u <= 0xdf)
        return 2;
    if (u <= 0xef)
        return 3;
    if (u <= 0xf4)
        return 4;

    return 0;           // 0xf5-0xff never start a sequence
}
22
23_meta_inline bool _strUTF8DecodeSeq(striter *_Nonnull it, uint32 len, uint8 ch, int32 *_Nullable codepoint)
24{
25 int32 ret = 0;
26
27 if (len == 2)
28 ret = ch & 0x1f;
29 else if (len == 3)
30 ret = ch & 0x0f;
31 else if (len == 4)
32 ret = ch & 0x07;
33 else
34 return false;
35
36 for (; len > 1; --len) {
37 if (!striChar(it, (uint8*)&ch))
38 return false;
39
40 if (ch < 0x80 || ch > 0xbf)
41 return false; // continuation byte must follow
42
43 ret = (ret << 6) | (ch & 0x3f);
44 }
45
46 if (ret > 0x10ffff || // outside unicode range
47 (ret >= 0xd800 && ret <= 0xdfff) || // UTF-16 surrogate pairs
48 (len == 2 && ret < 0x80) || // overlong encodings
49 (len == 3 && ret < 0x800) ||
50 (len == 4 && ret < 0x10000))
51 return false;
52
53 if (codepoint)
54 *codepoint = ret;
55
56 return true;
57}
58
59_meta_inline uint32 _strUTF8Decode(striter *_Nonnull it, int32 *_Nullable codepoint)
60{
61 uint8 first;
62 if (!striChar(it, (uint8*)&first))
63 return 0;
64
65 uint32 len = _strUTF8SeqLen(first);
66
67 if (len == 1) {
68 if (codepoint)
69 *codepoint = first;
70 return 1;
71 }
72
73 if (_strUTF8DecodeSeq(it, len, first, codepoint))
74 return len;
75 return 0;
76}
77
// Encodes a code point as UTF-8.
//
// buffer    - destination; up to 4 bytes are written, so it must have room for 4
// codepoint - value to encode
//
// Returns the number of bytes written (1 to 4), or 0 if codepoint is negative
// or greater than 0x10ffff, in which case buffer is left untouched.
//
// NOTE(review): values in the surrogate range 0xd800-0xdfff are encoded as
// ordinary 3-byte sequences here, while _strUTF8DecodeSeq refuses to decode
// them. Confirm whether callers rely on this before tightening it.
_meta_inline uint32 _strUTF8Encode(uint8 *_Nonnull buffer, int32 codepoint)
{
    if (codepoint < 0)
        return 0;

    if (codepoint < 0x80) {
        buffer[0] = (uint8)codepoint;
        return 1;
    }

    if (codepoint < 0x800) {
        buffer[0] = (uint8)(0xc0 | ((codepoint & 0x7c0) >> 6));
        buffer[1] = (uint8)(0x80 | (codepoint & 0x03f));
        return 2;
    }

    if (codepoint < 0x10000) {
        buffer[0] = (uint8)(0xe0 | ((codepoint & 0xf000) >> 12));
        buffer[1] = (uint8)(0x80 | ((codepoint & 0x0fc0) >> 6));
        buffer[2] = (uint8)(0x80 | (codepoint & 0x003f));
        return 3;
    }

    // The bound is inclusive: 0x10ffff is the last valid code point.
    if (codepoint <= 0x10ffff) {
        buffer[0] = (uint8)(0xf0 | ((codepoint & 0x1c0000) >> 18));
        buffer[1] = (uint8)(0x80 | ((codepoint & 0x03f000) >> 12));
        buffer[2] = (uint8)(0x80 | ((codepoint & 0x000fc0) >> 6));
        buffer[3] = (uint8)(0x80 | (codepoint & 0x00003f));
        return 4;
    }

    return 0;
}
bool striChar(striter *i, uint8 *out)
Definition striter.h:307