]>
Commit | Line | Data |
---|---|---|
556826b5 OM |
1 | /* |
2 | * Copyright (c) 2009-2016 Petri Lehtinen <petri@digip.org> | |
3 | * | |
4 | * Jansson is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the MIT license. See LICENSE for details. | |
6 | */ | |
7 | ||
8 | #include <string.h> | |
9 | #include "utf.h" | |
10 | ||
/*
 * Encode a single code point as UTF-8.
 *
 * codepoint: value to encode; must lie in 0..0x10FFFF.
 * buffer:    receives 1 to 4 bytes; no terminating NUL is written.
 * size:      receives the number of bytes stored in buffer.
 *
 * Returns 0 on success. Returns -1 when codepoint is negative or
 * greater than 0x10FFFF; in that case neither buffer nor *size is
 * written. Values in the surrogate range are not rejected here.
 */
int utf8_encode(int32_t codepoint, char *buffer, size_t *size)
{
    if (codepoint < 0 || codepoint > 0x10FFFF)
        return -1;

    if (codepoint < 0x80) {
        /* 7 bits: plain ASCII, one byte */
        buffer[0] = (char)codepoint;
        *size = 1;
        return 0;
    }

    if (codepoint < 0x800) {
        /* 11 bits: 110xxxxx 10xxxxxx */
        buffer[0] = (char)(0xC0 | (codepoint >> 6));
        buffer[1] = (char)(0x80 | (codepoint & 0x3F));
        *size = 2;
        return 0;
    }

    if (codepoint < 0x10000) {
        /* 16 bits: 1110xxxx 10xxxxxx 10xxxxxx */
        buffer[0] = (char)(0xE0 | (codepoint >> 12));
        buffer[1] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
        buffer[2] = (char)(0x80 | (codepoint & 0x3F));
        *size = 3;
        return 0;
    }

    /* 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    buffer[0] = (char)(0xF0 | (codepoint >> 18));
    buffer[1] = (char)(0x80 | ((codepoint >> 12) & 0x3F));
    buffer[2] = (char)(0x80 | ((codepoint >> 6) & 0x3F));
    buffer[3] = (char)(0x80 | (codepoint & 0x3F));
    *size = 4;
    return 0;
}
46 | ||
/*
 * Classify the first byte of a UTF-8 sequence.
 *
 * Returns the total length (1 to 4) of the sequence that this byte
 * starts, or 0 when the byte cannot start a sequence.
 */
size_t utf8_check_first(char byte)
{
    const unsigned char lead = (unsigned char)byte;

    /* 0x00..0x7F: single-byte (ASCII) */
    if (lead <= 0x7F)
        return 1;

    /* 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 could only
       start an overlong encoding of an ASCII byte */
    if (lead <= 0xC1)
        return 0;

    /* 0xC2..0xDF: 2-byte sequence */
    if (lead <= 0xDF)
        return 2;

    /* 0xE0..0xEF: 3-byte sequence */
    if (lead <= 0xEF)
        return 3;

    /* 0xF0..0xF4: 4-byte sequence */
    if (lead <= 0xF4)
        return 4;

    /* 0xF5..0xFF: restricted (start of 4-, 5- or 6-byte sequence)
       or invalid UTF-8 */
    return 0;
}
82 | ||
/*
 * Validate and decode one multi-byte UTF-8 sequence.
 *
 * buffer:    points at the first byte of the sequence.
 * size:      sequence length in bytes; only 2, 3 and 4 are accepted.
 * codepoint: if non-NULL, receives the decoded value on success only.
 *
 * Returns 1 when the sequence is valid, 0 otherwise. A sequence is
 * rejected when a trailing byte is not in 0x80..0xBF, when the decoded
 * value is above 0x10FFFF, lies in 0xD800..0xDFFF, or could have been
 * encoded in fewer bytes (overlong encoding).
 */
size_t utf8_check_full(const char *buffer, size_t size, int32_t *codepoint)
{
    const unsigned char lead = (unsigned char)buffer[0];
    int32_t decoded;
    int32_t smallest; /* lowest value that needs `size` bytes */
    size_t pos;

    /* Extract the payload bits of the first byte. */
    switch (size) {
    case 2:
        decoded = lead & 0x1F;
        smallest = 0x80;
        break;
    case 3:
        decoded = lead & 0x0F;
        smallest = 0x800;
        break;
    case 4:
        decoded = lead & 0x07;
        smallest = 0x10000;
        break;
    default:
        return 0;
    }

    /* Each trailing byte contributes its low six bits. */
    for (pos = 1; pos < size; pos++) {
        const unsigned char next = (unsigned char)buffer[pos];

        if (next < 0x80 || next > 0xBF) {
            /* not a continuation byte */
            return 0;
        }

        decoded = (decoded << 6) + (next & 0x3F);
    }

    /* not in Unicode range */
    if (decoded > 0x10FFFF)
        return 0;

    /* invalid code point (UTF-16 surrogate halves) */
    if (decoded >= 0xD800 && decoded <= 0xDFFF)
        return 0;

    /* overlong encoding */
    if (decoded < smallest)
        return 0;

    if (codepoint != NULL)
        *codepoint = decoded;

    return 1;
}
138 | ||
/*
 * Decode the UTF-8 sequence at the start of buffer and step past it.
 *
 * buffer:    input bytes.
 * bufsize:   number of bytes available in buffer.
 * codepoint: if non-NULL, receives the decoded value on success.
 *
 * Returns a pointer to the byte following the decoded sequence, or
 * NULL when the first byte cannot start a sequence, the sequence does
 * not fit in bufsize bytes, or utf8_check_full() rejects it. When
 * bufsize is 0, buffer itself is returned and *codepoint is left
 * untouched.
 */
const char *utf8_iterate(const char *buffer, size_t bufsize, int32_t *codepoint)
{
    int32_t decoded;
    size_t seq_len;

    if (bufsize == 0)
        return buffer;

    seq_len = utf8_check_first(buffer[0]);
    if (seq_len == 0)
        return NULL;

    if (seq_len > 1) {
        /* The whole sequence must be available before it is validated. */
        if (seq_len > bufsize)
            return NULL;

        if (!utf8_check_full(buffer, seq_len, &decoded))
            return NULL;
    } else {
        /* Single byte: the byte value is the code point. */
        decoded = (unsigned char)buffer[0];
    }

    if (codepoint != NULL)
        *codepoint = decoded;

    return buffer + seq_len;
}
164 | ||
/*
 * Check that the first `length` bytes of `string` are valid UTF-8.
 *
 * Returns 1 when every sequence is accepted by utf8_check_first() and,
 * for multi-byte sequences, utf8_check_full(); returns 0 on the first
 * invalid or truncated sequence. A zero-length input yields 1.
 */
int utf8_check_string(const char *string, size_t length)
{
    size_t pos = 0;

    while (pos < length) {
        const size_t seq_len = utf8_check_first(string[pos]);

        if (seq_len == 0)
            return 0;

        if (seq_len > 1) {
            /* A multi-byte sequence must not run past the end of the input. */
            if (seq_len > length - pos)
                return 0;

            if (!utf8_check_full(&string[pos], seq_len, NULL))
                return 0;
        }

        /* Advance past the whole sequence. */
        pos += seq_len;
    }

    return 1;
}