code cleanup: utfconv library had some quite confusing formatting, also cleared som...
[blender.git] / intern / utfconv / utfconv.c
1 /*
2  * ***** BEGIN GPL LICENSE BLOCK *****
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version. 
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  *
18  * The Original Code is Copyright (C) 2009 Blender Foundation.
19  * All rights reserved.
20  * 
21  * Contributor(s): Alexandr Kuznetsov, Andrea Weikert
22  *
23  * ***** END GPL LICENSE BLOCK *****
24  */
25
26 #include "utfconv.h"
27
/*
 * Count the bytes needed to hold a UTF-16 string once converted to UTF-8.
 *
 * string16: NUL-terminated sequence of UTF-16 code units, may be NULL.
 *
 * Returns the number of UTF-8 bytes including the terminating NUL,
 * or 0 when string16 is NULL.
 *
 * Illegal input contributes nothing to the count: a lone low surrogate is
 * skipped, and a high surrogate that is not followed by a low surrogate is
 * skipped together with the unit that follows it.
 */
size_t count_utf_8_from_16(const wchar_t *string16)
{
	size_t i;
	size_t count = 0;
	wchar_t u;

	if (!string16) {
		return 0;
	}

	for (i = 0; (u = string16[i]) != 0; i++) {
		if (u < 0x0080) {
			count += 1;
		}
		else if (u < 0x0800) {
			count += 2;
		}
		else if (u < 0xD800 || u >= 0xE000) {
			count += 3;
		}
		else if (u < 0xDC00) {
			/* High surrogate: the next unit must complete the pair. */
			u = string16[++i];
			if (u == 0) {
				/* String ends in the middle of a pair. */
				break;
			}
			if (u >= 0xDC00 && u < 0xE000) {
				count += 4;
			}
			/* Otherwise both units are dropped. */
		}
		/* else: lone low surrogate (0xDC00..0xDFFF) is illegal, count nothing. */
	}

	/* One extra byte for the terminating NUL. */
	return count + 1;
}
74
75
/*
 * Count the UTF-16 code units needed to hold a UTF-8 string once converted.
 *
 * string8: NUL-terminated UTF-8 byte string, may be NULL.
 *
 * Returns the number of UTF-16 code units including the terminating NUL,
 * or 0 when string8 is NULL.
 *
 * Malformed input contributes nothing: stray continuation or invalid lead
 * bytes are skipped, and a multi-byte sequence interrupted by a
 * non-continuation byte is discarded together with that byte. Decoded values
 * that are zero, fall in the surrogate range or exceed 0x10FFFF are not
 * counted either.
 */
size_t count_utf_16_from_8(const char *string8)
{
	size_t count = 0;
	char u;
	char type = 0;        /* continuation bytes still expected */
	unsigned int u32 = 0; /* code point being assembled */

	if (!string8) {
		return 0;
	}

	for (; (u = *string8) != 0; string8++) {
		if (type == 0) {
			/* Expecting the first byte of a sequence. */
			if ((u & 0x80) == 0) {
				/* 1 byte sequence */
				count++;
				u32 = 0;
			}
			else if ((u & 0xE0) == 0xC0) {
				/* 2 byte sequence */
				type = 1;
				u32 = u & 0x1F;
			}
			else if ((u & 0xF0) == 0xE0) {
				/* 3 byte sequence */
				type = 2;
				u32 = u & 0x0F;
			}
			else if ((u & 0xF8) == 0xF0) {
				/* 4 byte sequence */
				type = 3;
				u32 = u & 0x07;
			}
			/* Anything else is not a valid lead byte and is skipped. */
			continue;
		}

		if ((u & 0xC0) == 0x80) {
			u32 = (u32 << 6) | (u & 0x3F);
			type--;
		}
		else {
			/* Sequence broken: drop it, along with the current byte. */
			u32 = 0;
			type = 0;
		}

		if (type == 0) {
			if ((0 < u32 && u32 < 0xD800) || (0xE000 <= u32 && u32 < 0x10000)) {
				count++;
			}
			else if (0x10000 <= u32 && u32 < 0x110000) {
				/* Needs a surrogate pair. */
				count += 2;
			}
			u32 = 0;
		}
	}

	/* One extra unit for the terminating NUL. */
	return count + 1;
}
113
114
115 int conv_utf_16_to_8(const wchar_t *in16, char *out8, size_t size8)
116 {
117         char *out8end = out8 + size8;
118         wchar_t u = 0;
119         int err = 0;
120         if (!size8 || !in16 || !out8) return UTF_ERROR_NULL_IN;
121         out8end--;
122
123         for (; out8 < out8end && (u = *in16); in16++, out8++) {
124                 if (u < 0x0080) {
125                         *out8 = u;
126                 }
127                 else if (u < 0x0800) {
128                         if (out8 + 1 >= out8end) break;
129                         *out8++ = (0x3 << 6) | (0x1F & (u >> 6));
130                         *out8  = (0x1 << 7) | (0x3F & (u));
131                 }
132                 else if (u < 0xD800 || u >= 0xE000) {
133                         if (out8 + 2 >= out8end) break;
134                         *out8++ = (0x7 << 5) | (0xF & (u >> 12));
135                         *out8++ = (0x1 << 7) | (0x3F & (u >> 6));;
136                         *out8  = (0x1 << 7) | (0x3F & (u));
137                 }
138                 else if (u < 0xDC00) {
139                         wchar_t u2 = *++in16;
140
141                         if (!u2) break;
142                         if (u2 >= 0xDC00 && u2 < 0xE000) {
143                                 if (out8 + 3 >= out8end) break; else {
144                                         unsigned int uc = 0x10000 + (u2 - 0xDC00) + ((u - 0xD800) << 10);
145
146                                         *out8++ = (0xF << 4) | (0x7 & (uc >> 18));
147                                         *out8++ = (0x1 << 7) | (0x3F & (uc >> 12));
148                                         *out8++ = (0x1 << 7) | (0x3F & (uc >> 6));
149                                         *out8  = (0x1 << 7) | (0x3F & (uc));
150                                 }
151                         }
152                         else {
153                                 out8--; err |= UTF_ERROR_ILLCHAR;
154                         }
155                 }
156                 else if (u < 0xE000) {
157                         out8--; err |= UTF_ERROR_ILLCHAR;
158                 }
159         }
160
161         *out8 = *out8end = 0;
162
163         if (*in16) err |= UTF_ERROR_SMALL;
164
165         return err;
166 }
167
168
169 int conv_utf_8_to_16(const char *in8, wchar_t *out16, size_t size16)
170 {
171         char u;
172         char type = 0;
173         wchar_t u32 = 0;
174         wchar_t *out16end = out16 + size16;
175         int err = 0;
176         if (!size16 || !in8 || !out16) return UTF_ERROR_NULL_IN;
177         out16end--;
178
179         for (; out16 < out16end && (u = *in8); in8++) {
180                 if (type == 0) {
181                         if ((u & 0x01 << 7) == 0)     { *out16 = u; out16++; u32 = 0; continue; } //1 utf-8 char
182                         if ((u & 0x07 << 5) == 0xC0)  { type = 1; u32 = u & 0x1F; continue; }     //2 utf-8 char
183                         if ((u & 0x0F << 4) == 0xE0)  { type = 2; u32 = u & 0x0F; continue; }     //3 utf-8 char
184                         if ((u & 0x1F << 3) == 0xF0)  { type = 3; u32 = u & 0x07; continue; }     //4 utf-8 char
185                         err |= UTF_ERROR_ILLCHAR;
186                         continue;
187                 }
188                 else {
189                         if ((u & 0xC0) == 0x80) {
190                                 u32 = (u32 << 6) | (u & 0x3F);
191                                 type--;
192                         }
193                         else {
194                                 u32 = 0; type = 0; err |= UTF_ERROR_ILLSEQ;
195                         }
196                 }
197                 if (type == 0) {
198                         if ((0 < u32 && u32 < 0xD800) || (0xE000 <= u32 && u32 < 0x10000)) {
199                                 *out16 = u32;
200                                 out16++;
201                         }
202                         else if (0x10000 <= u32 && u32 < 0x110000) {
203                                 if (out16 + 1 >= out16end) break;
204                                 u32 -= 0x10000;
205                                 *out16 = 0xD800 + (u32 >> 10);
206                                 out16++;
207                                 *out16 = 0xDC00 + (u32 & 0x3FF);
208                                 out16++;
209                         }
210                         u32 = 0;
211                 }
212
213         }
214
215         *out16 = *out16end = 0;
216
217         if (*in8) err |= UTF_ERROR_SMALL;
218
219         return err;
220 }
221
/*
 * Check whether a NUL-terminated byte string contains only 7-bit characters.
 *
 * in8: string to inspect; must not be NULL.
 *
 * Returns 1 when no byte has its high bit set (an empty string qualifies),
 * 0 otherwise.
 */
int is_ascii(const char *in8)
{
	while (*in8) {
		if (*in8 & 0x80) {
			return 0;
		}
		in8++;
	}

	return 1;
}
229
/*
 * Currently a no-op: the string is neither read nor modified.
 *
 * NOTE(review): the name and parameters suggest this was meant to truncate
 * inout8 at or before maxcutpoint without splitting a multi-byte UTF-8
 * sequence - confirm the intended behaviour before implementing it.
 *
 * The previous body read inout8[maxcutpoint] into an unused local, which
 * could read outside the string, and formed that address before checking
 * inout8 for NULL. Neither had any effect, so both were removed.
 */
void utf_8_cut_end(char *inout8, size_t maxcutpoint)
{
	(void)inout8;
	(void)maxcutpoint;
}
238
239
240
/*
 * Allocate a buffer with malloc() and fill it with the UTF-8 conversion
 * of a UTF-16 string.
 *
 * in16: NUL-terminated UTF-16 string, may be NULL.
 * add:  number of extra bytes to allocate after the converted string;
 *       these bytes are left uninitialized.
 *
 * Returns the new NUL-terminated UTF-8 string, to be released by the caller
 * with free(), or NULL when in16 is NULL, the requested size overflows
 * size_t, or the allocation fails.
 */
char *alloc_utf_8_from_16(const wchar_t *in16, size_t add)
{
	size_t bsize = count_utf_8_from_16(in16);
	char *out8;

	if (!bsize) {
		return NULL;
	}

	/* bsize + add must not wrap: the converter is told it has bsize bytes. */
	if (add > (size_t)-1 - bsize) {
		return NULL;
	}

	out8 = (char *)malloc(bsize + add);
	if (!out8) {
		return NULL;
	}

	conv_utf_16_to_8(in16, out8, bsize);
	return out8;
}
250
/*
 * Allocate a buffer with malloc() and fill it with the UTF-16 conversion
 * of a UTF-8 string.
 *
 * in8: NUL-terminated UTF-8 string, may be NULL.
 * add: number of extra wchar_t units to allocate after the converted
 *      string; these units are left uninitialized.
 *
 * Returns the new NUL-terminated UTF-16 string, to be released by the caller
 * with free(), or NULL when in8 is NULL, the requested size overflows
 * size_t, or the allocation fails.
 */
wchar_t *alloc_utf16_from_8(const char *in8, size_t add)
{
	/* Largest element count whose byte size still fits in size_t. */
	const size_t max_units = (size_t)-1 / sizeof(wchar_t);
	size_t bsize = count_utf_16_from_8(in8);
	wchar_t *out16;

	if (!bsize) {
		return NULL;
	}

	/* Neither bsize + add nor the multiplication by the unit size may wrap:
	 * the converter is told it has bsize units. */
	if (bsize > max_units || add > max_units - bsize) {
		return NULL;
	}

	out16 = (wchar_t *)malloc(sizeof(wchar_t) * (bsize + add));
	if (!out16) {
		return NULL;
	}

	conv_utf_8_to_16(in8, out16, bsize);
	return out16;
}